diff --git a/CHANGELOG.md b/CHANGELOG.md index cef1284eea540d0c196b6f17e07fce6590368adf..c552a742b076f5829272bb063254cfca8d5ab6bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -18,6 +18,10 @@ This changelog aims to follow the guiding principles of ### Added - Median Blur for 3x3 kernels. +### Changed +- Performance of Gaussian Blur is greatly improved in return for some accuracy. + (Except for binomial variants up to 7x7 kernel size.) + ## 0.5.0 - 2025-06-10 ### Added diff --git a/conformity/opencv/test_gaussian_blur.cpp b/conformity/opencv/test_gaussian_blur.cpp index d6eef123641f09e82ceacaaf0cc587500f09e175..ee72a405c24a5cb37adbe135cbe67ed7c7e9e3a8 100644 --- a/conformity/opencv/test_gaussian_blur.cpp +++ b/conformity/opencv/test_gaussian_blur.cpp @@ -57,11 +57,11 @@ bool test_gaussian_blur(int index, RecreatedMessageQueue& request_queue, reply_queue, input); uint8_t threshold = 0; - // There are currently rounding differences sometimes - // between the OpenCV and KleidiCV implementations that use - // the 15x15 kernel size, so we ignore any non-matching - // values that fall within the specified threshold. - if constexpr (KernelSize >= 15) { + // Bit-exact operation is only guaranteed for kernels up to 7x7, and only + // if the binomial variant is used. + // For bigger kernels, and for all the CustomSigma variants, a small + // difference is allowed. 
+ if constexpr (KernelSize > 7 || !Binomial) { threshold = 2; } diff --git a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h index f95067a09e867664dacb91b75f6d90a5e89f8d8e..9427d8043715e3ec11e35f30b52cc566db115e8b 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_15x15_sc.h @@ -124,9 +124,11 @@ class SeparableFilter { svld1(pg, &src_rows.at(border_offsets.c13())[index]); SourceVectorType src_14 = svld1(pg, &src_rows.at(border_offsets.c14())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, - src_6, src_7, src_8, src_9, src_10, src_11, - src_12, src_13, src_14, &dst_rows[index]); + + std::reference_wrapper sources[15] = { + src_0, src_1, src_2, src_3, src_4, src_5, src_6, src_7, + src_8, src_9, src_10, src_11, src_12, src_13, src_14}; + filter_.vertical_vector_path(pg, sources, &dst_rows[index]); } void horizontal_vector_path_2x( @@ -180,14 +182,17 @@ class SeparableFilter { BufferVectorType src_0_14 = svld1(pg, &src_14[0]); BufferVectorType src_1_14 = svld1_vnum(pg, &src_14[0], 1); - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, - src_0_4, src_0_5, src_0_6, src_0_7, src_0_8, - src_0_9, src_0_10, src_0_11, src_0_12, - src_0_13, src_0_14, &dst_rows[index]); + std::reference_wrapper sources_0[15] = { + src_0_0, src_0_1, src_0_2, src_0_3, src_0_4, + src_0_5, src_0_6, src_0_7, src_0_8, src_0_9, + src_0_10, src_0_11, src_0_12, src_0_13, src_0_14}; + filter_.horizontal_vector_path(pg, sources_0, &dst_rows[index]); + std::reference_wrapper sources_1[15] = { + src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, + src_1_5, src_1_6, src_1_7, src_1_8, src_1_9, + src_1_10, src_1_11, src_1_12, src_1_13, src_1_14}; filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, - src_1_7, src_1_8, src_1_9, src_1_10, src_1_11, 
src_1_12, src_1_13, - src_1_14, &dst_rows[index + BufferVecTraits::num_lanes()]); + pg, sources_1, &dst_rows[index + BufferVecTraits::num_lanes()]); } void horizontal_vector_path(svbool_t pg, Rows src_rows, @@ -224,9 +229,10 @@ class SeparableFilter { svld1(pg, &src_rows.at(0, border_offsets.c13())[index]); BufferVectorType src_14 = svld1(pg, &src_rows.at(0, border_offsets.c14())[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, - src_6, src_7, src_8, src_9, src_10, src_11, - src_12, src_13, src_14, &dst_rows[index]); + std::reference_wrapper sources[15] = { + src_0, src_1, src_2, src_3, src_4, src_5, src_6, src_7, + src_8, src_9, src_10, src_11, src_12, src_13, src_14}; + filter_.horizontal_vector_path(pg, sources, &dst_rows[index]); } void process_horizontal_border( diff --git a/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h index a9ee82a9ce2c59cdea1157a20fd7f965705ef2ac..f164ea561b38ee066f8ec0fecf3f27cb7489b0de 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_21x21_sc.h @@ -136,10 +136,11 @@ class SeparableFilter { svld1(pg, &src_rows.at(border_offsets.c19())[index]); SourceVectorType src_20 = svld1(pg, &src_rows.at(border_offsets.c20())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, - src_6, src_7, src_8, src_9, src_10, src_11, - src_12, src_13, src_14, src_15, src_16, src_17, - src_18, src_19, src_20, &dst_rows[index]); + std::reference_wrapper sources[21] = { + src_0, src_1, src_2, src_3, src_4, src_5, src_6, + src_7, src_8, src_9, src_10, src_11, src_12, src_13, + src_14, src_15, src_16, src_17, src_18, src_19, src_20}; + filter_.vertical_vector_path(pg, sources, &dst_rows[index]); } void horizontal_vector_path_2x( @@ -211,16 +212,17 @@ class SeparableFilter { BufferVectorType src_0_20 = svld1(pg, &src_20[0]); 
BufferVectorType src_1_20 = svld1_vnum(pg, &src_20[0], 1); + std::reference_wrapper sources_0[21] = { + src_0_0, src_0_1, src_0_2, src_0_3, src_0_4, src_0_5, src_0_6, + src_0_7, src_0_8, src_0_9, src_0_10, src_0_11, src_0_12, src_0_13, + src_0_14, src_0_15, src_0_16, src_0_17, src_0_18, src_0_19, src_0_20}; + filter_.horizontal_vector_path(pg, sources_0, &dst_rows[index]); + std::reference_wrapper sources_1[21] = { + src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, + src_1_7, src_1_8, src_1_9, src_1_10, src_1_11, src_1_12, src_1_13, + src_1_14, src_1_15, src_1_16, src_1_17, src_1_18, src_1_19, src_1_20}; filter_.horizontal_vector_path( - pg, src_0_0, src_0_1, src_0_2, src_0_3, src_0_4, src_0_5, src_0_6, - src_0_7, src_0_8, src_0_9, src_0_10, src_0_11, src_0_12, src_0_13, - src_0_14, src_0_15, src_0_16, src_0_17, src_0_18, src_0_19, src_0_20, - &dst_rows[index]); - filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, - src_1_7, src_1_8, src_1_9, src_1_10, src_1_11, src_1_12, src_1_13, - src_1_14, src_1_15, src_1_16, src_1_17, src_1_18, src_1_19, src_1_20, - &dst_rows[index + BufferVecTraits::num_lanes()]); + pg, sources_1, &dst_rows[index + BufferVecTraits::num_lanes()]); } void horizontal_vector_path(svbool_t pg, Rows src_rows, @@ -269,10 +271,12 @@ class SeparableFilter { svld1(pg, &src_rows.at(0, border_offsets.c19())[index]); BufferVectorType src_20 = svld1(pg, &src_rows.at(0, border_offsets.c20())[index]); - filter_.horizontal_vector_path( - pg, src_0, src_1, src_2, src_3, src_4, src_5, src_6, src_7, src_8, - src_9, src_10, src_11, src_12, src_13, src_14, src_15, src_16, src_17, - src_18, src_19, src_20, &dst_rows[index]); + + std::reference_wrapper sources[21] = { + src_0, src_1, src_2, src_3, src_4, src_5, src_6, + src_7, src_8, src_9, src_10, src_11, src_12, src_13, + src_14, src_15, src_16, src_17, src_18, src_19, src_20}; + filter_.horizontal_vector_path(pg, sources, &dst_rows[index]); } void 
process_horizontal_border( diff --git a/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h index 6f624ae1cca79a135837bcd0cc1db91d36b827b8..1b43ea2a43d91ffa17b1eb91c31829744c366101 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_3x3_sc.h @@ -100,7 +100,8 @@ class SeparableFilter { svld1(pg, &src_rows.at(border_offsets.c1())[index]); SourceVectorType src_2 = svld1(pg, &src_rows.at(border_offsets.c2())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); + std::reference_wrapper sources[3] = {src_0, src_1, src_2}; + filter_.vertical_vector_path(pg, sources, &dst_rows[index]); } void horizontal_vector_path_2x( @@ -118,11 +119,13 @@ class SeparableFilter { BufferVectorType src_0_2 = svld1(pg, &src_2[0]); BufferVectorType src_1_2 = svld1_vnum(pg, &src_2[0], 1); - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, - &dst_rows[index]); + std::reference_wrapper sources_0[3] = {src_0_0, src_0_1, + src_0_2}; + filter_.horizontal_vector_path(pg, sources_0, &dst_rows[index]); + std::reference_wrapper sources_1[3] = {src_1_0, src_1_1, + src_1_2}; filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, - &dst_rows[index + BufferVecTraits::num_lanes()]); + pg, sources_1, &dst_rows[index + BufferVecTraits::num_lanes()]); } void horizontal_vector_path(svbool_t pg, Rows src_rows, @@ -135,7 +138,9 @@ class SeparableFilter { svld1(pg, &src_rows.at(0, border_offsets.c1())[index]); BufferVectorType src_2 = svld1(pg, &src_rows.at(0, border_offsets.c2())[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, &dst_rows[index]); + + std::reference_wrapper sources[3] = {src_0, src_1, src_2}; + filter_.horizontal_vector_path(pg, sources, &dst_rows[index]); } void process_horizontal_border( diff --git a/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h 
b/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h index 909e8ce18d2c4421295fd0e8653d043a7d67eb32..8e03d2ef993f0a49d9b91ca8caf744e3ceb586fb 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_5x5_sc.h @@ -104,8 +104,9 @@ class SeparableFilter { svld1(pg, &src_rows.at(border_offsets.c3())[index]); SourceVectorType src_4 = svld1(pg, &src_rows.at(border_offsets.c4())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, - &dst_rows[index]); + std::reference_wrapper sources[5] = {src_0, src_1, src_2, + src_3, src_4}; + filter_.vertical_vector_path(pg, sources, &dst_rows[index]); } void horizontal_vector_path_2x( @@ -128,12 +129,13 @@ class SeparableFilter { BufferVectorType src_1_3 = svld1_vnum(pg, &src_3[0], 1); BufferVectorType src_0_4 = svld1(pg, &src_4[0]); BufferVectorType src_1_4 = svld1_vnum(pg, &src_4[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, - src_0_4, &dst_rows[index]); + std::reference_wrapper sources_0[5] = { + src_0_0, src_0_1, src_0_2, src_0_3, src_0_4}; + filter_.horizontal_vector_path(pg, sources_0, &dst_rows[index]); + std::reference_wrapper sources_1[5] = { + src_1_0, src_1_1, src_1_2, src_1_3, src_1_4}; filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, - &dst_rows[index + BufferVecTraits::num_lanes()]); + pg, sources_1, &dst_rows[index + BufferVecTraits::num_lanes()]); } void horizontal_vector_path(svbool_t pg, Rows src_rows, @@ -150,8 +152,9 @@ class SeparableFilter { svld1(pg, &src_rows.at(0, border_offsets.c3())[index]); BufferVectorType src_4 = svld1(pg, &src_rows.at(0, border_offsets.c4())[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, src_4, - &dst_rows[index]); + std::reference_wrapper sources[5] = {src_0, src_1, src_2, + src_3, src_4}; + filter_.horizontal_vector_path(pg, sources, &dst_rows[index]); } void 
process_horizontal_border( diff --git a/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h b/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h index 33f204a10361991b726c42b692a4807b20049e31..0f9ffcac727cb0e7679f02f51ebaf8004dc21ffa 100644 --- a/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h +++ b/kleidicv/include/kleidicv/filters/separable_filter_7x7_sc.h @@ -108,8 +108,9 @@ class SeparableFilter { svld1(pg, &src_rows.at(border_offsets.c5())[index]); SourceVectorType src_6 = svld1(pg, &src_rows.at(border_offsets.c6())[index]); - filter_.vertical_vector_path(pg, src_0, src_1, src_2, src_3, src_4, src_5, - src_6, &dst_rows[index]); + std::reference_wrapper sources[7] = { + src_0, src_1, src_2, src_3, src_4, src_5, src_6}; + filter_.vertical_vector_path(pg, sources, &dst_rows[index]); } void horizontal_vector_path_2x( @@ -138,12 +139,13 @@ class SeparableFilter { BufferVectorType src_1_5 = svld1_vnum(pg, &src_5[0], 1); BufferVectorType src_0_6 = svld1(pg, &src_6[0]); BufferVectorType src_1_6 = svld1_vnum(pg, &src_6[0], 1); - - filter_.horizontal_vector_path(pg, src_0_0, src_0_1, src_0_2, src_0_3, - src_0_4, src_0_5, src_0_6, &dst_rows[index]); + std::reference_wrapper sources_0[7] = { + src_0_0, src_0_1, src_0_2, src_0_3, src_0_4, src_0_5, src_0_6}; + filter_.horizontal_vector_path(pg, sources_0, &dst_rows[index]); + std::reference_wrapper sources_1[7] = { + src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6}; filter_.horizontal_vector_path( - pg, src_1_0, src_1_1, src_1_2, src_1_3, src_1_4, src_1_5, src_1_6, - &dst_rows[index + BufferVecTraits::num_lanes()]); + pg, sources_1, &dst_rows[index + BufferVecTraits::num_lanes()]); } void horizontal_vector_path(svbool_t pg, Rows src_rows, @@ -164,8 +166,10 @@ class SeparableFilter { svld1(pg, &src_rows.at(0, border_offsets.c5())[index]); BufferVectorType src_6 = svld1(pg, &src_rows.at(0, border_offsets.c6())[index]); - filter_.horizontal_vector_path(pg, src_0, src_1, src_2, src_3, 
src_4, src_5, - src_6, &dst_rows[index]); + + std::reference_wrapper sources[7] = { + src_0, src_1, src_2, src_3, src_4, src_5, src_6}; + filter_.horizontal_vector_path(pg, sources, &dst_rows[index]); } void process_horizontal_border( diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index c560227961e6c48729aac5a0a3a53f309124d3ba..14c2f6a5e4c4a3bb281590ad0fcc7b005ef5682b 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -38,8 +38,6 @@ class GaussianBlur { using BufferVectorType = typename VecTraits::VectorType; using DestinationType = ScalarType; - explicit GaussianBlur(float sigma [[maybe_unused]]) {} - // Applies vertical filtering vector using SIMD operations. // // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T @@ -99,7 +97,7 @@ class GaussianBlur { using BufferType = uint16_t; using DestinationType = uint8_t; - explicit GaussianBlur(float sigma [[maybe_unused]]) + GaussianBlur() : const_6_u8_half_{vdup_n_u8(6)}, const_6_u16_{vdupq_n_u16(6)}, const_4_u16_{vdupq_n_u16(4)} {} @@ -180,7 +178,7 @@ class GaussianBlur { using BufferType = uint16_t; using DestinationType = uint8_t; - explicit GaussianBlur(float sigma [[maybe_unused]]) + GaussianBlur() : const_7_u16_{vdupq_n_u16(7)}, const_7_u32_{vdupq_n_u32(7)}, const_9_u16_{vdupq_n_u16(9)} {} @@ -294,343 +292,77 @@ class GaussianBlur { uint16x8_t const_9_u16_; }; // end of class GaussianBlur -// Template for 15x15 Gaussian Blur binomial filters. -// -// [ 16, 44, 100, 192 ... 192, 100, 44, 16 ] -// [ 44, 121, 275, 528 ... 528, 275, 121, 44 ] -// [ 100, 275, 625, 1200 ... 1200, 625, 275, 100 ] -// [ 192, 528, 1200, 2304 ... 2304, 1200, 528, 192 ] -// F = 1/1048576 * [ | | | | ... | | | | ] = -// [ 192, 528, 1200, 2304 ... 2304, 1200, 528, 192 ] -// [ 100, 275, 625, 1200 ... 1200, 625, 275, 100 ] -// [ 44, 121, 275, 528 ... 528, 275, 121, 44 ] -// [ 16, 44, 100, 192 ... 
192, 100, 44, 16 ] -// -// [ 4 ] -// [ 11 ] -// [ 25 ] -// [ 48 ] -// [ 81 ] -// [ 118 ] -// [ 146 ] -// = 1/1048576 * [ 158 ] * [4,11,25,48,81,118,146,158,146,118,81,48,25,11,4] -// [ 146 ] -// [ 118 ] -// [ 81 ] -// [ 48 ] -// [ 25 ] -// [ 11 ] -// [ 4 ] -template <> -class GaussianBlur { - public: - using SourceType = uint8_t; - using BufferType = uint32_t; - using DestinationType = uint8_t; - - explicit GaussianBlur(float sigma [[maybe_unused]]) - : const_11_u16_{vdupq_n_u16(11)}, - const_11_u32_{vdupq_n_u32(11)}, - const_25_u16_{vdupq_n_u16(25)}, - const_25_u32_{vdupq_n_u32(25)}, - const_81_u16_{vdupq_n_u16(81)}, - const_81_u32_{vdupq_n_u32(81)}, - const_118_u16_half_{vdup_n_u16(118)}, - const_118_u32_{vdupq_n_u32(118)}, - const_146_u16_half_{vdup_n_u16(146)}, - const_146_u32_{vdupq_n_u32(146)}, - const_158_u16_half_{vdup_n_u16(158)}, - const_158_u32_{vdupq_n_u32(158)} {} - - // Applies vertical filtering vector using SIMD operations. - // - // DST = [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * - // * [ 4, 11, 25, 48 ... 
48, 25, 11, 4 ]T - void vertical_vector_path(uint8x16_t src[15], BufferType *dst) const { - uint16x8_t acc_7_l = vmovl_u8(vget_low_u8(src[7])); - uint16x8_t acc_7_h = vmovl_u8(vget_high_u8(src[7])); - - uint16x8_t acc_1_13_l = vaddl_u8(vget_low_u8(src[1]), vget_low_u8(src[13])); - uint16x8_t acc_1_13_h = - vaddl_u8(vget_high_u8(src[1]), vget_high_u8(src[13])); - - uint16x8_t acc_2_12_l = vaddl_u8(vget_low_u8(src[2]), vget_low_u8(src[12])); - uint16x8_t acc_2_12_h = - vaddl_u8(vget_high_u8(src[2]), vget_high_u8(src[12])); - - uint16x8_t acc_6_8_l = vaddl_u8(vget_low_u8(src[6]), vget_low_u8(src[8])); - uint16x8_t acc_6_8_h = vaddl_u8(vget_high_u8(src[6]), vget_high_u8(src[8])); - - uint16x8_t acc_5_9_l = vaddl_u8(vget_low_u8(src[5]), vget_low_u8(src[9])); - uint16x8_t acc_5_9_h = vaddl_u8(vget_high_u8(src[5]), vget_high_u8(src[9])); - - uint16x8_t acc_0_14_l = vaddl_u8(vget_low_u8(src[0]), vget_low_u8(src[14])); - uint16x8_t acc_0_14_h = - vaddl_u8(vget_high_u8(src[0]), vget_high_u8(src[14])); - - uint16x8_t acc_3_11_l = vaddl_u8(vget_low_u8(src[3]), vget_low_u8(src[11])); - uint16x8_t acc_3_11_h = - vaddl_u8(vget_high_u8(src[3]), vget_high_u8(src[11])); - - uint16x8_t acc_4_10_l = vaddl_u8(vget_low_u8(src[4]), vget_low_u8(src[10])); - uint16x8_t acc_4_10_h = - vaddl_u8(vget_high_u8(src[4]), vget_high_u8(src[10])); - - acc_0_14_l = vshlq_n_u16(acc_0_14_l, 2); - acc_0_14_h = vshlq_n_u16(acc_0_14_h, 2); - - acc_3_11_l = vshlq_n_u16(acc_3_11_l, 2); - acc_3_11_h = vshlq_n_u16(acc_3_11_h, 2); - - acc_4_10_l = vmulq_u16(acc_4_10_l, const_81_u16_); - acc_4_10_h = vmulq_u16(acc_4_10_h, const_81_u16_); - - uint16x8_t acc_1_3_11_13_l = vaddq_u16(acc_3_11_l, acc_1_13_l); - uint16x8_t acc_1_3_11_13_h = vaddq_u16(acc_3_11_h, acc_1_13_h); - acc_1_3_11_13_l = vmlaq_u16(acc_3_11_l, acc_1_3_11_13_l, const_11_u16_); - acc_1_3_11_13_h = vmlaq_u16(acc_3_11_h, acc_1_3_11_13_h, const_11_u16_); - - uint16x8_t acc_0_1_3_11_13_14_l = vaddq_u16(acc_1_3_11_13_l, acc_0_14_l); - uint16x8_t 
acc_0_1_3_11_13_14_h = vaddq_u16(acc_1_3_11_13_h, acc_0_14_h); - - uint16x8_t acc_2_4_10_12_l = - vmlaq_u16(acc_4_10_l, acc_2_12_l, const_25_u16_); - uint16x8_t acc_2_4_10_12_h = - vmlaq_u16(acc_4_10_h, acc_2_12_h, const_25_u16_); - - uint32x4x4_t acc = {{ - vaddl_u16(vget_low_u16(acc_2_4_10_12_l), - vget_low_u16(acc_0_1_3_11_13_14_l)), - vaddl_u16(vget_high_u16(acc_2_4_10_12_l), - vget_high_u16(acc_0_1_3_11_13_14_l)), - vaddl_u16(vget_low_u16(acc_2_4_10_12_h), - vget_low_u16(acc_0_1_3_11_13_14_h)), - vaddl_u16(vget_high_u16(acc_2_4_10_12_h), - vget_high_u16(acc_0_1_3_11_13_14_h)), - }}; - - acc.val[0] = - vmlal_u16(acc.val[0], vget_low_u16(acc_6_8_l), const_146_u16_half_); - acc.val[1] = - vmlal_u16(acc.val[1], vget_high_u16(acc_6_8_l), const_146_u16_half_); - acc.val[2] = - vmlal_u16(acc.val[2], vget_low_u16(acc_6_8_h), const_146_u16_half_); - acc.val[3] = - vmlal_u16(acc.val[3], vget_high_u16(acc_6_8_h), const_146_u16_half_); - - acc.val[0] = - vmlal_u16(acc.val[0], vget_low_u16(acc_5_9_l), const_118_u16_half_); - acc.val[1] = - vmlal_u16(acc.val[1], vget_high_u16(acc_5_9_l), const_118_u16_half_); - acc.val[2] = - vmlal_u16(acc.val[2], vget_low_u16(acc_5_9_h), const_118_u16_half_); - acc.val[3] = - vmlal_u16(acc.val[3], vget_high_u16(acc_5_9_h), const_118_u16_half_); - - acc.val[0] = - vmlal_u16(acc.val[0], vget_low_u16(acc_7_l), const_158_u16_half_); - acc.val[1] = - vmlal_u16(acc.val[1], vget_high_u16(acc_7_l), const_158_u16_half_); - acc.val[2] = - vmlal_u16(acc.val[2], vget_low_u16(acc_7_h), const_158_u16_half_); - acc.val[3] = - vmlal_u16(acc.val[3], vget_high_u16(acc_7_h), const_158_u16_half_); - neon::VecTraits::store(acc, &dst[0]); - } - - // Applies vertical filtering vector using scalar operations. - // - // DST = [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * - // * [ 4, 11, 25, 48 ... 
48, 25, 11, 4 ]T - void vertical_scalar_path(const SourceType src[15], BufferType *dst) const { - uint32_t acc = (static_cast(src[3]) + src[11]) * 4; - acc += (acc + src[1] + src[13]) * 11; - acc += (src[0] + src[14]) * 4 + (src[2] + src[12]) * 25 + - (src[4] + src[10]) * 81; - acc += (src[5] + src[9]) * 118 + (src[6] + src[8]) * 146 + src[7] * 158; - dst[0] = acc; - } - - // Applies horizontal filtering vector using SIMD operations. - // - // DST = 1/1048576 * [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * - // * [ 4, 11, 25, 48 ... 48, 25, 11, 4 ]T - void horizontal_vector_path(uint32x4_t src[15], DestinationType *dst) const { - uint32x4_t acc_1_13 = vaddq_u32(src[1], src[13]); - uint32x4_t acc_2_12 = vaddq_u32(src[2], src[12]); - uint32x4_t acc_6_8 = vaddq_u32(src[6], src[8]); - uint32x4_t acc_5_9 = vaddq_u32(src[5], src[9]); - uint32x4_t acc_0_14 = vaddq_u32(src[0], src[14]); - uint32x4_t acc_3_11 = vaddq_u32(src[3], src[11]); - uint32x4_t acc_4_10 = vaddq_u32(src[4], src[10]); - - acc_0_14 = vshlq_n_u32(acc_0_14, 2); - acc_3_11 = vshlq_n_u32(acc_3_11, 2); - acc_4_10 = vmulq_u32(acc_4_10, const_81_u32_); - - uint32x4_t acc_1_3_11_13 = vaddq_u32(acc_3_11, acc_1_13); - acc_1_3_11_13 = vmlaq_u32(acc_3_11, acc_1_3_11_13, const_11_u32_); - uint32x4_t acc_0_1_3_11_13_14 = vaddq_u32(acc_1_3_11_13, acc_0_14); - uint32x4_t acc_2_4_10_12 = vmlaq_u32(acc_4_10, acc_2_12, const_25_u32_); - - uint32x4_t acc = vaddq_u32(acc_2_4_10_12, acc_0_1_3_11_13_14); - acc = vmlaq_u32(acc, acc_6_8, const_146_u32_); - acc = vmlaq_u32(acc, acc_5_9, const_118_u32_); - acc = vmlaq_u32(acc, src[7], const_158_u32_); - acc = vrshrq_n_u32(acc, 20); - - uint16x4_t narrowed = vmovn_u32(acc); - uint8x8_t interleaved = - vuzp1_u8(vreinterpret_u8_u16(narrowed), vreinterpret_u8_u16(narrowed)); - uint32_t result = vget_lane_u32(vreinterpret_u32_u8(interleaved), 0); - memcpy(&dst[0], &result, sizeof(result)); - } - - // Applies horizontal filtering vector using scalar operations. 
- // - // DST = 1/1048576 * [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * - // * [ 4, 11, 25, 48 ... 48, 25, 11, 4 ]T - void horizontal_scalar_path(const BufferType src[15], - DestinationType *dst) const { - uint32_t acc = (static_cast(src[3]) + src[11]) * 4; - acc += (acc + src[1] + src[13]) * 11; - acc += (src[0] + src[14]) * 4 + (src[2] + src[12]) * 25 + - (src[4] + src[10]) * 81; - acc += (src[5] + src[9]) * 118 + (src[6] + src[8]) * 146 + src[7] * 158; - dst[0] = static_cast(rounding_shift_right(acc, 20)); - } - - private: - uint16x8_t const_11_u16_; - uint32x4_t const_11_u32_; - uint16x8_t const_25_u16_; - uint32x4_t const_25_u32_; - uint16x8_t const_81_u16_; - uint32x4_t const_81_u32_; - uint16x4_t const_118_u16_half_; - uint32x4_t const_118_u32_; - uint16x4_t const_146_u16_half_; - uint32x4_t const_146_u32_; - uint16x4_t const_158_u16_half_; - uint32x4_t const_158_u32_; -}; // end of class GaussianBlur - template class GaussianBlur { public: using SourceType = uint8_t; - using BufferType = uint32_t; + using BufferType = uint8_t; using DestinationType = uint8_t; static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize); - // Ignored because vectors are initialized in the constructor body. 
- // NOLINTNEXTLINE - hicpp-member-init - explicit GaussianBlur(float sigma) - : half_kernel_(generate_gaussian_half_kernel(sigma)) { - for (size_t i = 0; i < kHalfKernelSize; i++) { - half_kernel_u16_[i] = vdupq_n_u16(half_kernel_[i]); - half_kernel_u32_[i] = vdupq_n_u32(half_kernel_[i]); - } - } + explicit GaussianBlur(const std::array half_kernel) + : half_kernel_(half_kernel) {} void vertical_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const { - uint16x8_t initial_l = vmovl_u8(vget_low_u8(src[KernelSize >> 1])); - uint16x8_t initial_h = vmovl_high_u8(src[KernelSize >> 1]); - - uint32x4_t acc_l_l = - vmull_u16(vget_low_u16(initial_l), - vget_low_u16(half_kernel_u16_[KernelSize >> 1])); - uint32x4_t acc_l_h = - vmull_high_u16(initial_l, half_kernel_u16_[KernelSize >> 1]); - uint32x4_t acc_h_l = - vmull_u16(vget_low_u16(initial_h), - vget_low_u16(half_kernel_u16_[KernelSize >> 1])); - uint32x4_t acc_h_h = - vmull_high_u16(initial_h, half_kernel_u16_[KernelSize >> 1]); - - // Optimization to avoid unnecessary branching in vector code. 
- KLEIDICV_FORCE_LOOP_UNROLL - for (size_t i = 0; i < (KernelSize >> 1); i++) { - const size_t j = KernelSize - i - 1; - uint16x8_t vec_l = vaddl_u8(vget_low_u8(src[i]), vget_low_u8(src[j])); - uint16x8_t vec_h = vaddl_high_u8(src[i], src[j]); - - acc_l_l = vmlal_u16(acc_l_l, vget_low_u16(vec_l), - vget_low_u16(half_kernel_u16_[i])); - acc_l_h = vmlal_high_u16(acc_l_h, vec_l, half_kernel_u16_[i]); - acc_h_l = vmlal_u16(acc_h_l, vget_low_u16(vec_h), - vget_low_u16(half_kernel_u16_[i])); - acc_h_h = vmlal_high_u16(acc_h_h, vec_h, half_kernel_u16_[i]); - } - - uint32x4x4_t result = {acc_l_l, acc_l_h, acc_h_l, acc_h_h}; - neon::VecTraits::store(result, &dst[0]); + common_vector_path(src, dst); } void vertical_scalar_path(const SourceType src[KernelSize], BufferType *dst) const { - BufferType acc = static_cast(src[0]) * half_kernel_[0]; + uint16_t acc = static_cast(src[kHalfKernelSize - 1]) * + half_kernel_[kHalfKernelSize - 1]; // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL - for (size_t i = 1; i <= (KernelSize >> 1); i++) { - acc += static_cast(src[i]) * half_kernel_[i]; + for (size_t i = 0; i < kHalfKernelSize - 1; i++) { + acc += (static_cast(src[i]) + + static_cast(src[KernelSize - i - 1])) * + half_kernel_[i]; } - KLEIDICV_FORCE_LOOP_UNROLL - for (size_t i = (KernelSize >> 1) + 1; i < KernelSize; i++) { - size_t j = KernelSize - i - 1; - acc += static_cast(src[i]) * half_kernel_[j]; - } - - dst[0] = acc; + dst[0] = static_cast(rounding_shift_right(acc, 8)); } - void horizontal_vector_path(uint32x4_t src[KernelSize], + void horizontal_vector_path(uint8x16_t src[KernelSize], DestinationType *dst) const { - uint32x4_t acc = - vmulq_u32(src[KernelSize >> 1], half_kernel_u32_[KernelSize >> 1]); - - // Optimization to avoid unnecessary branching in vector code. 
- KLEIDICV_FORCE_LOOP_UNROLL - for (size_t i = 0; i < (KernelSize >> 1); i++) { - const size_t j = KernelSize - i - 1; - uint32x4_t vec_inner = vaddq_u32(src[i], src[j]); - acc = vmlaq_u32(acc, vec_inner, half_kernel_u32_[i]); - } - - uint32x4_t acc_u32 = vrshrq_n_u32(acc, 16); - uint16x4_t narrowed = vmovn_u32(acc_u32); - uint8x8_t interleaved = - vuzp1_u8(vreinterpret_u8_u16(narrowed), vreinterpret_u8_u16(narrowed)); - uint32_t result = vget_lane_u32(vreinterpret_u32_u8(interleaved), 0); - memcpy(&dst[0], &result, sizeof(result)); + common_vector_path(src, dst); } void horizontal_scalar_path(const BufferType src[KernelSize], DestinationType *dst) const { - BufferType acc = src[0] * half_kernel_[0]; + vertical_scalar_path(src, dst); + } + + private: + void common_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const { + uint8x8_t half_kernel_mid = vdup_n_u8(half_kernel_[kHalfKernelSize - 1]); + uint16x8_t acc_l = + vmlal_u8(vdupq_n_u16(128), vget_low_u8(src[kHalfKernelSize - 1]), + half_kernel_mid); + uint16x8_t acc_h = + vmlal_u8(vdupq_n_u16(128), vget_high_u8(src[kHalfKernelSize - 1]), + half_kernel_mid); // Optimization to avoid unnecessary branching in vector code. 
KLEIDICV_FORCE_LOOP_UNROLL - for (size_t i = 1; i <= (KernelSize >> 1); i++) { - acc += src[i] * half_kernel_[i]; - } + for (size_t i = 0; i < kHalfKernelSize - 1; i++) { + const size_t j = KernelSize - i - 1; + uint16x8_t vec_l = vaddl_u8(vget_low_u8(src[i]), vget_low_u8(src[j])); + uint16x8_t vec_h = vaddl_high_u8(src[i], src[j]); + uint16x8_t coeff = vdupq_n_u16(half_kernel_[i]); - KLEIDICV_FORCE_LOOP_UNROLL - for (size_t i = (KernelSize >> 1) + 1; i < KernelSize; i++) { - size_t j = KernelSize - i - 1; - acc += src[i] * half_kernel_[j]; + acc_l = vmlaq_u16(acc_l, vec_l, coeff); + acc_h = vmlaq_u16(acc_h, vec_h, coeff); } - dst[0] = static_cast(rounding_shift_right(acc, 16)); + // Keep only the highest 8 bits + uint8x16_t result = + vuzp2q_u8(vreinterpretq_u8_u16(acc_l), vreinterpretq_u8_u16(acc_h)); + neon::VecTraits::store(result, &dst[0]); } - private: const std::array half_kernel_; - uint16x8_t half_kernel_u16_[kHalfKernelSize]; - uint32x4_t half_kernel_u32_[kHalfKernelSize]; }; // end of class GaussianBlur template @@ -641,15 +373,35 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( SeparableFilterWorkspace *workspace) { using GaussianBlurFilter = GaussianBlur; - GaussianBlurFilter blur{sigma}; - SeparableFilter filter{blur}; - Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; - workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, - border_type, filter); - return KLEIDICV_OK; + if constexpr (IsBinomial) { + GaussianBlurFilter blur; + SeparableFilter filter{blur}; + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, + border_type, filter); + + return KLEIDICV_OK; + } else { + constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize); + auto half_kernel = generate_gaussian_half_kernel(sigma); + // If sigma is so small that the middle point gets all the weights, it's + // just a copy + if (half_kernel[kHalfKernelSize - 1] < 256) { + GaussianBlurFilter blur(half_kernel); 
+ SeparableFilter filter{blur}; + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, + border_type, filter); + } else { + for (size_t row = y_begin; row < y_end; ++row) { + std::memcpy(static_cast(&dst_rows.at(row)[0]), + static_cast(&src_rows.at(row)[0]), + rect.width() * sizeof(ScalarType) * dst_rows.channels()); + } + } + return KLEIDICV_OK; + } } template @@ -674,7 +426,8 @@ static kleidicv_error_t gaussian_blur(size_t kernel_size, const ScalarType *src, src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, sigma, border_type, workspace); case 15: - return gaussian_blur_fixed_kernel_size<15, IsBinomial>( + // 15x15 does not have a binomial variant + return gaussian_blur_fixed_kernel_size<15, false>( src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, sigma, border_type, workspace); case 21: @@ -682,8 +435,7 @@ static kleidicv_error_t gaussian_blur(size_t kernel_size, const ScalarType *src, return gaussian_blur_fixed_kernel_size<21, false>( src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, sigma, border_type, workspace); - - // gaussian_blur_is_implemented checked the kernel size already. + // gaussian_blur_is_implemented checked the kernel size already. // GCOVR_EXCL_START default: assert(!"kernel size not implemented"); diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index 936ee012a09f7640ec043388f09252449d8d87bd..50494fbd950ccb6c31c99d382d32ce4f3181e685 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -35,20 +35,17 @@ class GaussianBlur { using BufferType = uint16_t; using DestinationType = uint8_t; - explicit GaussianBlur([[maybe_unused]] float sigma) - KLEIDICV_STREAMING_COMPATIBLE {} - // Applies vertical filtering vector using SIMD operations. 
// // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void vertical_vector_path(svbool_t pg, svuint8_t src_0, svuint8_t src_1, - svuint8_t src_2, BufferType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - svuint16_t acc_0_2_b = svaddlb_u16(src_0, src_2); - svuint16_t acc_0_2_t = svaddlt_u16(src_0, src_2); + void vertical_vector_path( + svbool_t pg, std::reference_wrapper src[3], + BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t acc_0_2_b = svaddlb_u16(src[0], src[2]); + svuint16_t acc_0_2_t = svaddlt_u16(src[0], src[2]); - svuint16_t acc_1_b = svshllb_n_u16(src_1, 1); - svuint16_t acc_1_t = svshllt_n_u16(src_1, 1); + svuint16_t acc_1_b = svshllb_n_u16(src[1], 1); + svuint16_t acc_1_t = svshllt_n_u16(src[1], 1); svuint16_t acc_u16_b = svadd_u16_x(pg, acc_0_2_b, acc_1_b); svuint16_t acc_u16_t = svadd_u16_x(pg, acc_0_2_t, acc_1_t); @@ -60,12 +57,12 @@ class GaussianBlur { // Applies horizontal filtering vector using SIMD operations. // // DST = 1/16 * [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void horizontal_vector_path(svbool_t pg, svuint16_t src_0, svuint16_t src_1, - svuint16_t src_2, DestinationType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - svuint16_t acc_0_2 = svhadd_u16_x(pg, src_0, src_2); + void horizontal_vector_path( + svbool_t pg, std::reference_wrapper src[3], + DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t acc_0_2 = svhadd_u16_x(pg, src[0], src[2]); - svuint16_t acc = svadd_u16_x(pg, acc_0_2, src_1); + svuint16_t acc = svadd_u16_x(pg, acc_0_2, src[1]); acc = svrshr_x(pg, acc, 3); svst1b(pg, &dst[0], acc); @@ -95,23 +92,19 @@ class GaussianBlur { using BufferType = uint16_t; using DestinationType = uint8_t; - explicit GaussianBlur([[maybe_unused]] float sigma) - KLEIDICV_STREAMING_COMPATIBLE {} - // Applies vertical filtering vector using SIMD operations. 
// // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T - void vertical_vector_path(svbool_t pg, svuint8_t src_0, svuint8_t src_1, - svuint8_t src_2, svuint8_t src_3, svuint8_t src_4, - BufferType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - svuint16_t acc_0_4_b = svaddlb_u16(src_0, src_4); - svuint16_t acc_0_4_t = svaddlt_u16(src_0, src_4); - svuint16_t acc_1_3_b = svaddlb_u16(src_1, src_3); - svuint16_t acc_1_3_t = svaddlt_u16(src_1, src_3); + void vertical_vector_path( + svbool_t pg, std::reference_wrapper src[5], + BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t acc_0_4_b = svaddlb_u16(src[0], src[4]); + svuint16_t acc_0_4_t = svaddlt_u16(src[0], src[4]); + svuint16_t acc_1_3_b = svaddlb_u16(src[1], src[3]); + svuint16_t acc_1_3_t = svaddlt_u16(src[1], src[3]); - svuint16_t acc_u16_b = svmlalb_n_u16(acc_0_4_b, src_2, 6); - svuint16_t acc_u16_t = svmlalt_n_u16(acc_0_4_t, src_2, 6); + svuint16_t acc_u16_b = svmlalb_n_u16(acc_0_4_b, src[2], 6); + svuint16_t acc_u16_t = svmlalt_n_u16(acc_0_4_t, src[2], 6); acc_u16_b = svmla_n_u16_x(pg, acc_u16_b, acc_1_3_b, 4); acc_u16_t = svmla_n_u16_x(pg, acc_u16_t, acc_1_3_t, 4); @@ -122,13 +115,12 @@ class GaussianBlur { // Applies horizontal filtering vector using SIMD operations. 
// // DST = 1/256 * [ SRC0, SRC1, SRC2, SRC3, SRC4 ] * [ 1, 4, 6, 4, 1 ]T - void horizontal_vector_path(svbool_t pg, svuint16_t src_0, svuint16_t src_1, - svuint16_t src_2, svuint16_t src_3, - svuint16_t src_4, DestinationType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - svuint16_t acc_0_4 = svadd_x(pg, src_0, src_4); - svuint16_t acc_1_3 = svadd_x(pg, src_1, src_3); - svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src_2, 6); + void horizontal_vector_path( + svbool_t pg, std::reference_wrapper src[5], + DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t acc_0_4 = svadd_x(pg, src[0], src[4]); + svuint16_t acc_1_3 = svadd_x(pg, src[1], src[3]); + svuint16_t acc = svmla_n_u16_x(pg, acc_0_4, src[2], 6); acc = svmla_n_u16_x(pg, acc, acc_1_3, 4); acc = svrshr_x(pg, acc, 8); svst1b(pg, &dst[0], acc); @@ -168,28 +160,24 @@ class GaussianBlur { using BufferType = uint16_t; using DestinationType = uint8_t; - explicit GaussianBlur([[maybe_unused]] float sigma) - KLEIDICV_STREAMING_COMPATIBLE {} - // Applies vertical filtering vector using SIMD operations. 
// // DST = [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * // * [ 2, 7, 14, 18, 14, 7, 2 ]T void vertical_vector_path( - svbool_t pg, svuint8_t src_0, svuint8_t src_1, svuint8_t src_2, - svuint8_t src_3, svuint8_t src_4, svuint8_t src_5, svuint8_t src_6, + svbool_t pg, std::reference_wrapper src[7], BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - svuint16_t acc_0_6_b = svaddlb_u16(src_0, src_6); - svuint16_t acc_0_6_t = svaddlt_u16(src_0, src_6); + svuint16_t acc_0_6_b = svaddlb_u16(src[0], src[6]); + svuint16_t acc_0_6_t = svaddlt_u16(src[0], src[6]); - svuint16_t acc_1_5_b = svaddlb_u16(src_1, src_5); - svuint16_t acc_1_5_t = svaddlt_u16(src_1, src_5); + svuint16_t acc_1_5_b = svaddlb_u16(src[1], src[5]); + svuint16_t acc_1_5_t = svaddlt_u16(src[1], src[5]); - svuint16_t acc_2_4_b = svaddlb_u16(src_2, src_4); - svuint16_t acc_2_4_t = svaddlt_u16(src_2, src_4); + svuint16_t acc_2_4_b = svaddlb_u16(src[2], src[4]); + svuint16_t acc_2_4_t = svaddlt_u16(src[2], src[4]); - svuint16_t acc_3_b = svmovlb_u16(src_3); - svuint16_t acc_3_t = svmovlt_u16(src_3); + svuint16_t acc_3_b = svmovlb_u16(src[3]); + svuint16_t acc_3_t = svmovlt_u16(src[3]); svuint16_t acc_0_2_4_6_b = svmla_n_u16_x(pg, acc_0_6_b, acc_2_4_b, 7); svuint16_t acc_0_2_4_6_t = svmla_n_u16_x(pg, acc_0_6_t, acc_2_4_t, 7); @@ -214,22 +202,21 @@ class GaussianBlur { // DST = 1/4096 * [ SRC0, SRC1, SRC2, SRC3, SRC4, SRC5, SRC6 ] * // * [ 2, 7, 14, 18, 14, 7, 2 ]T void horizontal_vector_path( - svbool_t pg, svuint16_t src_0, svuint16_t src_1, svuint16_t src_2, - svuint16_t src_3, svuint16_t src_4, svuint16_t src_5, svuint16_t src_6, + svbool_t pg, std::reference_wrapper src[7], DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - svuint32_t acc_0_6_b = svaddlb_u32(src_0, src_6); - svuint32_t acc_0_6_t = svaddlt_u32(src_0, src_6); + svuint32_t acc_0_6_b = svaddlb_u32(src[0], src[6]); + svuint32_t acc_0_6_t = svaddlt_u32(src[0], src[6]); - svuint32_t acc_1_5_b = svaddlb_u32(src_1, src_5); - 
svuint32_t acc_1_5_t = svaddlt_u32(src_1, src_5); + svuint32_t acc_1_5_b = svaddlb_u32(src[1], src[5]); + svuint32_t acc_1_5_t = svaddlt_u32(src[1], src[5]); - svuint16_t acc_2_4 = svadd_u16_x(pg, src_2, src_4); + svuint16_t acc_2_4 = svadd_u16_x(pg, src[2], src[4]); svuint32_t acc_0_2_4_6_b = svmlalb_n_u32(acc_0_6_b, acc_2_4, 7); svuint32_t acc_0_2_4_6_t = svmlalt_n_u32(acc_0_6_t, acc_2_4, 7); - svuint32_t acc_0_2_3_4_6_b = svmlalb_n_u32(acc_0_2_4_6_b, src_3, 9); - svuint32_t acc_0_2_3_4_6_t = svmlalt_n_u32(acc_0_2_4_6_t, src_3, 9); + svuint32_t acc_0_2_3_4_6_b = svmlalb_n_u32(acc_0_2_4_6_b, src[3], 9); + svuint32_t acc_0_2_3_4_6_t = svmlalt_n_u32(acc_0_2_4_6_t, src[3], 9); acc_0_2_3_4_6_b = svlsl_n_u32_x(pg, acc_0_2_3_4_6_b, 1); acc_0_2_3_4_6_t = svlsl_n_u32_x(pg, acc_0_2_3_4_6_t, 1); @@ -259,750 +246,88 @@ class GaussianBlur { } }; // end of class GaussianBlur -// Template for 15x15 Gaussian Blur binomial filters. -// -// [ 16, 44, 100, 192 ... 192, 100, 44, 16 ] -// [ 44, 121, 275, 528 ... 528, 275, 121, 44 ] -// [ 100, 275, 625, 1200 ... 1200, 625, 275, 100 ] -// [ 192, 528, 1200, 2304 ... 2304, 1200, 528, 192 ] -// F = 1/1048576 * [ | | | | ... | | | | ] = -// [ 192, 528, 1200, 2304 ... 2304, 1200, 528, 192 ] -// [ 100, 275, 625, 1200 ... 1200, 625, 275, 100 ] -// [ 44, 121, 275, 528 ... 528, 275, 121, 44 ] -// [ 16, 44, 100, 192 ... 192, 100, 44, 16 ] -// -// [ 4 ] -// [ 11 ] -// [ 25 ] -// [ 48 ] -// [ 81 ] -// [ 118 ] -// [ 146 ] -// = 1/1048576 * [ 158 ] * [4,11,25,48,81,118,146,158,146,118,81,48,25,11,4] -// [ 146 ] -// [ 118 ] -// [ 81 ] -// [ 48 ] -// [ 25 ] -// [ 11 ] -// [ 4 ] -template <> -class GaussianBlur { - public: - using SourceType = uint8_t; - using BufferType = uint32_t; - using DestinationType = uint8_t; - - explicit GaussianBlur([[maybe_unused]] float sigma) - KLEIDICV_STREAMING_COMPATIBLE {} - - // Applies vertical filtering vector using SIMD operations. 
- // - // DST = [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * - // * [ 4, 11, 25, 48 ... 48, 25, 11, 4 ]T - void vertical_vector_path( - svbool_t pg, svuint8_t src_0, svuint8_t src_1, svuint8_t src_2, - svuint8_t src_3, svuint8_t src_4, svuint8_t src_5, svuint8_t src_6, - svuint8_t src_7, svuint8_t src_8, svuint8_t src_9, svuint8_t src_10, - svuint8_t src_11, svuint8_t src_12, svuint8_t src_13, svuint8_t src_14, - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - svuint16_t acc_7_b = svmovlb_u16(src_7); - svuint16_t acc_7_t = svmovlt_u16(src_7); - - svuint16_t acc_1_13_b = svaddlb_u16(src_1, src_13); - svuint16_t acc_1_13_t = svaddlt_u16(src_1, src_13); - - svuint16_t acc_2_12_b = svaddlb_u16(src_2, src_12); - svuint16_t acc_2_12_t = svaddlt_u16(src_2, src_12); - - svuint16_t acc_6_8_b = svaddlb_u16(src_6, src_8); - svuint16_t acc_6_8_t = svaddlt_u16(src_6, src_8); - - svuint16_t acc_5_9_b = svaddlb_u16(src_5, src_9); - svuint16_t acc_5_9_t = svaddlt_u16(src_5, src_9); - - svuint16_t acc_0_14_b = svaddlb_u16(src_0, src_14); - svuint16_t acc_0_14_t = svaddlt_u16(src_0, src_14); - - svuint16_t acc_3_11_b = svaddlb_u16(src_3, src_11); - svuint16_t acc_3_11_t = svaddlt_u16(src_3, src_11); - - svuint16_t acc_4_10_b = svaddlb_u16(src_4, src_10); - svuint16_t acc_4_10_t = svaddlt_u16(src_4, src_10); - - acc_0_14_b = svlsl_n_u16_x(pg, acc_0_14_b, 2); - acc_0_14_t = svlsl_n_u16_x(pg, acc_0_14_t, 2); - - acc_3_11_b = svlsl_n_u16_x(pg, acc_3_11_b, 2); - acc_3_11_t = svlsl_n_u16_x(pg, acc_3_11_t, 2); - - acc_4_10_b = svmul_n_u16_x(pg, acc_4_10_b, 81); - acc_4_10_t = svmul_n_u16_x(pg, acc_4_10_t, 81); - - svuint16_t acc_1_3_11_13_b = svadd_u16_x(pg, acc_3_11_b, acc_1_13_b); - svuint16_t acc_1_3_11_13_t = svadd_u16_x(pg, acc_3_11_t, acc_1_13_t); - acc_1_3_11_13_b = svmla_n_u16_x(pg, acc_3_11_b, acc_1_3_11_13_b, 11); - acc_1_3_11_13_t = svmla_n_u16_x(pg, acc_3_11_t, acc_1_3_11_13_t, 11); - - svuint16_t acc_0_1_3_11_13_14_b = - svadd_u16_x(pg, acc_1_3_11_13_b, 
acc_0_14_b); - svuint16_t acc_0_1_3_11_13_14_t = - svadd_u16_x(pg, acc_1_3_11_13_t, acc_0_14_t); - - svuint16_t acc_2_4_10_12_b = svmla_n_u16_x(pg, acc_4_10_b, acc_2_12_b, 25); - svuint16_t acc_2_4_10_12_t = svmla_n_u16_x(pg, acc_4_10_t, acc_2_12_t, 25); - - svuint32_t acc_b_b = svaddlb_u32(acc_2_4_10_12_b, acc_0_1_3_11_13_14_b); - svuint32_t acc_b_t = svaddlb_u32(acc_2_4_10_12_t, acc_0_1_3_11_13_14_t); - svuint32_t acc_t_b = svaddlt_u32(acc_2_4_10_12_b, acc_0_1_3_11_13_14_b); - svuint32_t acc_t_t = svaddlt_u32(acc_2_4_10_12_t, acc_0_1_3_11_13_14_t); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_6_8_b, 146); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_6_8_t, 146); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_6_8_b, 146); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_6_8_t, 146); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_5_9_b, 118); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_5_9_t, 118); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_5_9_b, 118); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_5_9_t, 118); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_7_b, 158); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_7_t, 158); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_7_b, 158); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_7_t, 158); - - svuint32x4_t interleaved = - svcreate4_u32(acc_b_b, acc_b_t, acc_t_b, acc_t_t); - svst4_u32(pg, &dst[0], interleaved); - } - - // Applies horizontal filtering vector using SIMD operations. - // - // DST = 1/1048576 * [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * - // * [ 4, 11, 25, 48 ... 
48, 25, 11, 4 ]T - void horizontal_vector_path( - svbool_t pg, svuint32_t src_0, svuint32_t src_1, svuint32_t src_2, - svuint32_t src_3, svuint32_t src_4, svuint32_t src_5, svuint32_t src_6, - svuint32_t src_7, svuint32_t src_8, svuint32_t src_9, svuint32_t src_10, - svuint32_t src_11, svuint32_t src_12, svuint32_t src_13, - svuint32_t src_14, - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - svuint32_t acc_1_13 = svadd_u32_x(pg, src_1, src_13); - svuint32_t acc_2_12 = svadd_u32_x(pg, src_2, src_12); - svuint32_t acc_6_8 = svadd_u32_x(pg, src_6, src_8); - svuint32_t acc_5_9 = svadd_u32_x(pg, src_5, src_9); - svuint32_t acc_0_14 = svadd_u32_x(pg, src_0, src_14); - svuint32_t acc_3_11 = svadd_u32_x(pg, src_3, src_11); - svuint32_t acc_4_10 = svadd_u32_x(pg, src_4, src_10); - - acc_0_14 = svlsl_n_u32_x(pg, acc_0_14, 2); - acc_3_11 = svlsl_n_u32_x(pg, acc_3_11, 2); - acc_4_10 = svmul_n_u32_x(pg, acc_4_10, 81); - - svuint32_t acc_1_3_11_13 = svadd_u32_x(pg, acc_3_11, acc_1_13); - acc_1_3_11_13 = svmla_n_u32_x(pg, acc_3_11, acc_1_3_11_13, 11); - svuint32_t acc_0_1_3_11_13_14 = svadd_u32_x(pg, acc_1_3_11_13, acc_0_14); - svuint32_t acc_2_4_10_12 = svmla_n_u32_x(pg, acc_4_10, acc_2_12, 25); - - svuint32_t acc = svadd_u32_x(pg, acc_2_4_10_12, acc_0_1_3_11_13_14); - acc = svmla_n_u32_x(pg, acc, acc_6_8, 146); - acc = svmla_n_u32_x(pg, acc, acc_5_9, 118); - acc = svmla_n_u32_x(pg, acc, src_7, 158); - acc = svrshr_n_u32_x(pg, acc, 20); - svst1b_u32(pg, &dst[0], acc); - } - - // Applies horizontal filtering vector using scalar operations. - // - // DST = 1/1048576 * [ SRC0, SRC1, SRC2, SRC3...SRC11, SRC12, SRC13, SRC14 ] * - // * [ 4, 11, 25, 48 ... 
48, 25, 11, 4 ]T - void horizontal_scalar_path(const BufferType src[15], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = (static_cast(src[3]) + src[11]) * 4; - acc += (acc + src[1] + src[13]) * 11; - acc += (src[0] + src[14]) * 4 + (src[2] + src[12]) * 25 + - (src[4] + src[10]) * 81; - acc += (src[5] + src[9]) * 118 + (src[6] + src[8]) * 146 + src[7] * 158; - dst[0] = rounding_shift_right(acc, 20); - } -}; // end of class GaussianBlur - -template -class GaussianBlurNonBinomialBase; - +// CustomSigma variant template -class GaussianBlurNonBinomialBase { - protected: - explicit GaussianBlurNonBinomialBase(float sigma) - KLEIDICV_STREAMING_COMPATIBLE - : half_kernel_( - generate_gaussian_half_kernel( - sigma)) {} - - const std::array half_kernel_; -}; - -template <> -class GaussianBlur final - : public GaussianBlurNonBinomialBase { - public: - using SourceType = uint8_t; - using BufferType = uint32_t; - using DestinationType = uint8_t; - - explicit GaussianBlur(float sigma) KLEIDICV_STREAMING_COMPATIBLE - : GaussianBlurNonBinomialBase(sigma) {} - - void vertical_vector_path(svbool_t pg, svuint8_t src_0, svuint8_t src_1, - svuint8_t src_2, BufferType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - // 1 - svuint16_t acc_1_b = svmovlb_u16(src_1); - svuint16_t acc_1_t = svmovlt_u16(src_1); - - svuint32_t acc_b_b = svmullb_n_u32(acc_1_b, half_kernel_[1]); - svuint32_t acc_b_t = svmullb_n_u32(acc_1_t, half_kernel_[1]); - svuint32_t acc_t_b = svmullt_n_u32(acc_1_b, half_kernel_[1]); - svuint32_t acc_t_t = svmullt_n_u32(acc_1_t, half_kernel_[1]); - - // 0 - 2 - svuint16_t acc_0_2_b = svaddlb_u16(src_0, src_2); - svuint16_t acc_0_2_t = svaddlt_u16(src_0, src_2); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_0_2_b, half_kernel_[0]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_0_2_t, half_kernel_[0]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_0_2_b, half_kernel_[0]); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_0_2_t, half_kernel_[0]); - - svuint32x4_t interleaved = 
svcreate4(acc_b_b, acc_b_t, acc_t_b, acc_t_t); - svst4(pg, &dst[0], interleaved); - } - - void horizontal_vector_path(svbool_t pg, svuint32_t src_0, svuint32_t src_1, - svuint32_t src_2, DestinationType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - // 1 - svuint32_t acc = svmul_n_u32_x(pg, src_1, half_kernel_[1]); - - // 0 - 2 - svuint32_t acc_0_2 = svadd_u32_x(pg, src_0, src_2); - acc = svmla_n_u32_x(pg, acc, acc_0_2, half_kernel_[0]); - - acc = svrshr_n_u32_x(pg, acc, 16); - svst1b_u32(pg, &dst[0], acc); - } - - void horizontal_scalar_path(const BufferType src[3], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = src[0] * half_kernel_[0] + src[1] * half_kernel_[1] + - src[2] * half_kernel_[0]; - dst[0] = static_cast(rounding_shift_right(acc, 16)); - } -}; // end of class GaussianBlur - -template <> -class GaussianBlur final - : public GaussianBlurNonBinomialBase { +class GaussianBlur { public: using SourceType = uint8_t; - using BufferType = uint32_t; + using BufferType = uint8_t; using DestinationType = uint8_t; + using SourceVecTraits = + typename ::KLEIDICV_TARGET_NAMESPACE::VecTraits; + using SourceVectorType = typename SourceVecTraits::VectorType; - explicit GaussianBlur(float sigma) KLEIDICV_STREAMING_COMPATIBLE - : GaussianBlurNonBinomialBase(sigma) {} - - void vertical_vector_path(svbool_t pg, svuint8_t src_0, svuint8_t src_1, - svuint8_t src_2, svuint8_t src_3, svuint8_t src_4, - BufferType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - // 2 - svuint16_t acc_2_b = svmovlb_u16(src_2); - svuint16_t acc_2_t = svmovlt_u16(src_2); - - svuint32_t acc_b_b = svmullb_n_u32(acc_2_b, half_kernel_[2]); - svuint32_t acc_b_t = svmullb_n_u32(acc_2_t, half_kernel_[2]); - svuint32_t acc_t_b = svmullt_n_u32(acc_2_b, half_kernel_[2]); - svuint32_t acc_t_t = svmullt_n_u32(acc_2_t, half_kernel_[2]); - - // 1 - 3 - svuint16_t acc_1_3_b = svaddlb_u16(src_1, src_3); - svuint16_t acc_1_3_t = svaddlt_u16(src_1, src_3); - - acc_b_b = 
svmlalb_n_u32(acc_b_b, acc_1_3_b, half_kernel_[1]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_1_3_t, half_kernel_[1]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_1_3_b, half_kernel_[1]); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_1_3_t, half_kernel_[1]); - - // 0 - 4 - svuint16_t acc_0_4_b = svaddlb_u16(src_0, src_4); - svuint16_t acc_0_4_t = svaddlt_u16(src_0, src_4); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_0_4_b, half_kernel_[0]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_0_4_t, half_kernel_[0]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_0_4_b, half_kernel_[0]); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_0_4_t, half_kernel_[0]); - - svuint32x4_t interleaved = svcreate4(acc_b_b, acc_b_t, acc_t_b, acc_t_t); - svst4(pg, &dst[0], interleaved); - } - - void horizontal_vector_path(svbool_t pg, svuint32_t src_0, svuint32_t src_1, - svuint32_t src_2, svuint32_t src_3, - svuint32_t src_4, DestinationType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - // 2 - svuint32_t acc = svmul_n_u32_x(pg, src_2, half_kernel_[2]); - - // 1 - 3 - svuint32_t acc_1_3 = svadd_u32_x(pg, src_1, src_3); - acc = svmla_n_u32_x(pg, acc, acc_1_3, half_kernel_[1]); - - // 0 - 4 - svuint32_t acc_0_4 = svadd_u32_x(pg, src_0, src_4); - acc = svmla_n_u32_x(pg, acc, acc_0_4, half_kernel_[0]); - - acc = svrshr_n_u32_x(pg, acc, 16); - svst1b_u32(pg, &dst[0], acc); - } - - void horizontal_scalar_path(const BufferType src[5], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = src[0] * half_kernel_[0] + src[1] * half_kernel_[1] + - src[2] * half_kernel_[2] + src[3] * half_kernel_[1] + - src[4] * half_kernel_[0]; - dst[0] = static_cast(rounding_shift_right(acc, 16)); - } -}; // end of class GaussianBlur - -template <> -class GaussianBlur final - : public GaussianBlurNonBinomialBase { - public: - using SourceType = uint8_t; - using BufferType = uint32_t; - using DestinationType = uint8_t; + static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize); - explicit GaussianBlur(float sigma) 
KLEIDICV_STREAMING_COMPATIBLE - : GaussianBlurNonBinomialBase(sigma) {} + explicit GaussianBlur(const std::array half_kernel) + : half_kernel_(half_kernel) {} void vertical_vector_path( - svbool_t pg, svuint8_t src_0, svuint8_t src_1, svuint8_t src_2, - svuint8_t src_3, svuint8_t src_4, svuint8_t src_5, svuint8_t src_6, + svbool_t pg, std::reference_wrapper src[KernelSize], BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - // 3 - svuint16_t acc_3_b = svmovlb_u16(src_3); - svuint16_t acc_3_t = svmovlt_u16(src_3); - - svuint32_t acc_b_b = svmullb_n_u32(acc_3_b, half_kernel_[3]); - svuint32_t acc_b_t = svmullb_n_u32(acc_3_t, half_kernel_[3]); - svuint32_t acc_t_b = svmullt_n_u32(acc_3_b, half_kernel_[3]); - svuint32_t acc_t_t = svmullt_n_u32(acc_3_t, half_kernel_[3]); - - // 2 - 4 - svuint16_t acc_2_4_b = svaddlb_u16(src_2, src_4); - svuint16_t acc_2_4_t = svaddlt_u16(src_2, src_4); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_2_4_b, half_kernel_[2]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_2_4_t, half_kernel_[2]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_2_4_b, half_kernel_[2]); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_2_4_t, half_kernel_[2]); - - // 1 - 5 - svuint16_t acc_1_5_b = svaddlb_u16(src_1, src_5); - svuint16_t acc_1_5_t = svaddlt_u16(src_1, src_5); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_1_5_b, half_kernel_[1]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_1_5_t, half_kernel_[1]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_1_5_b, half_kernel_[1]); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_1_5_t, half_kernel_[1]); - - // 0 - 6 - svuint16_t acc_0_6_b = svaddlb_u16(src_0, src_6); - svuint16_t acc_0_6_t = svaddlt_u16(src_0, src_6); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_0_6_b, half_kernel_[0]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_0_6_t, half_kernel_[0]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_0_6_b, half_kernel_[0]); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_0_6_t, half_kernel_[0]); - - svuint32x4_t interleaved = svcreate4(acc_b_b, acc_b_t, acc_t_b, acc_t_t); - svst4(pg, 
&dst[0], interleaved); + common_vector_path(pg, src, dst); } - void horizontal_vector_path( - svbool_t pg, svuint32_t src_0, svuint32_t src_1, svuint32_t src_2, - svuint32_t src_3, svuint32_t src_4, svuint32_t src_5, svuint32_t src_6, - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - // 3 - svuint32_t acc = svmul_n_u32_x(pg, src_3, half_kernel_[3]); - - // 2 - 4 - svuint32_t acc_2_4 = svadd_u32_x(pg, src_2, src_4); - acc = svmla_n_u32_x(pg, acc, acc_2_4, half_kernel_[2]); - - // 1 - 5 - svuint32_t acc_1_5 = svadd_u32_x(pg, src_1, src_5); - acc = svmla_n_u32_x(pg, acc, acc_1_5, half_kernel_[1]); - - // 0 - 6 - svuint32_t acc_0_6 = svadd_u32_x(pg, src_0, src_6); - acc = svmla_n_u32_x(pg, acc, acc_0_6, half_kernel_[0]); - - acc = svrshr_n_u32_x(pg, acc, 16); - svst1b_u32(pg, &dst[0], acc); - } - - void horizontal_scalar_path(const BufferType src[7], DestinationType *dst) + void vertical_scalar_path(const SourceType src[KernelSize], BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = src[0] * half_kernel_[0] + src[1] * half_kernel_[1] + - src[2] * half_kernel_[2] + src[3] * half_kernel_[3] + - src[4] * half_kernel_[2] + src[5] * half_kernel_[1] + - src[6] * half_kernel_[0]; - dst[0] = static_cast(rounding_shift_right(acc, 16)); - } -}; // end of class GaussianBlur + uint32_t acc = static_cast(src[kHalfKernelSize - 1]) * + half_kernel_[kHalfKernelSize - 1]; -template <> -class GaussianBlur final - : public GaussianBlurNonBinomialBase { - public: - using SourceType = uint8_t; - using BufferType = uint32_t; - using DestinationType = uint8_t; - - explicit GaussianBlur(float sigma) KLEIDICV_STREAMING_COMPATIBLE - : GaussianBlurNonBinomialBase(sigma) {} + // Optimization to avoid unnecessary branching in vector code. 
+ KLEIDICV_FORCE_LOOP_UNROLL + for (size_t i = 0; i < kHalfKernelSize - 1; i++) { + acc += (static_cast(src[i]) + + static_cast(src[KernelSize - i - 1])) * + half_kernel_[i]; + } - void vertical_vector_path( - svbool_t pg, svuint8_t src_0, svuint8_t src_1, svuint8_t src_2, - svuint8_t src_3, svuint8_t src_4, svuint8_t src_5, svuint8_t src_6, - svuint8_t src_7, svuint8_t src_8, svuint8_t src_9, svuint8_t src_10, - svuint8_t src_11, svuint8_t src_12, svuint8_t src_13, svuint8_t src_14, - BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - // 7 - svuint16_t acc_7_b = svmovlb_u16(src_7); - svuint16_t acc_7_t = svmovlt_u16(src_7); - - svuint32_t acc_b_b = svmullb_n_u32(acc_7_b, half_kernel_[7]); - svuint32_t acc_b_t = svmullb_n_u32(acc_7_t, half_kernel_[7]); - svuint32_t acc_t_b = svmullt_n_u32(acc_7_b, half_kernel_[7]); - svuint32_t acc_t_t = svmullt_n_u32(acc_7_t, half_kernel_[7]); - - // 6 - 8 - svuint16_t acc_6_8_b = svaddlb_u16(src_6, src_8); - svuint16_t acc_6_8_t = svaddlt_u16(src_6, src_8); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_6_8_b, half_kernel_[6]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_6_8_t, half_kernel_[6]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_6_8_b, half_kernel_[6]); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_6_8_t, half_kernel_[6]); - - // 5 - 9 - svuint16_t acc_5_9_b = svaddlb_u16(src_5, src_9); - svuint16_t acc_5_9_t = svaddlt_u16(src_5, src_9); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_5_9_b, half_kernel_[5]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_5_9_t, half_kernel_[5]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_5_9_b, half_kernel_[5]); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_5_9_t, half_kernel_[5]); - - // 4 - 10 - svuint16_t acc_4_10_b = svaddlb_u16(src_4, src_10); - svuint16_t acc_4_10_t = svaddlt_u16(src_4, src_10); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_4_10_b, half_kernel_[4]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_4_10_t, half_kernel_[4]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_4_10_b, half_kernel_[4]); - acc_t_t = 
svmlalt_n_u32(acc_t_t, acc_4_10_t, half_kernel_[4]); - - // 3 - 11 - svuint16_t acc_3_11_b = svaddlb_u16(src_3, src_11); - svuint16_t acc_3_11_t = svaddlt_u16(src_3, src_11); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_3_11_b, half_kernel_[3]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_3_11_t, half_kernel_[3]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_3_11_b, half_kernel_[3]); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_3_11_t, half_kernel_[3]); - - // 2 - 12 - svuint16_t acc_2_12_b = svaddlb_u16(src_2, src_12); - svuint16_t acc_2_12_t = svaddlt_u16(src_2, src_12); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_2_12_b, half_kernel_[2]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_2_12_t, half_kernel_[2]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_2_12_b, half_kernel_[2]); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_2_12_t, half_kernel_[2]); - - // 1 - 13 - svuint16_t acc_1_13_b = svaddlb_u16(src_1, src_13); - svuint16_t acc_1_13_t = svaddlt_u16(src_1, src_13); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_1_13_b, half_kernel_[1]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_1_13_t, half_kernel_[1]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_1_13_b, half_kernel_[1]); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_1_13_t, half_kernel_[1]); - - // 0 - 14 - svuint16_t acc_0_14_b = svaddlb_u16(src_0, src_14); - svuint16_t acc_0_14_t = svaddlt_u16(src_0, src_14); - - acc_b_b = svmlalb_n_u32(acc_b_b, acc_0_14_b, half_kernel_[0]); - acc_b_t = svmlalb_n_u32(acc_b_t, acc_0_14_t, half_kernel_[0]); - acc_t_b = svmlalt_n_u32(acc_t_b, acc_0_14_b, half_kernel_[0]); - acc_t_t = svmlalt_n_u32(acc_t_t, acc_0_14_t, half_kernel_[0]); - - svuint32x4_t interleaved = svcreate4(acc_b_b, acc_b_t, acc_t_b, acc_t_t); - svst4(pg, &dst[0], interleaved); + dst[0] = static_cast(rounding_shift_right(acc, 8)); } void horizontal_vector_path( - svbool_t pg, svuint32_t src_0, svuint32_t src_1, svuint32_t src_2, - svuint32_t src_3, svuint32_t src_4, svuint32_t src_5, svuint32_t src_6, - svuint32_t src_7, svuint32_t src_8, svuint32_t src_9, svuint32_t 
src_10, - svuint32_t src_11, svuint32_t src_12, svuint32_t src_13, - svuint32_t src_14, - DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - // 7 - svuint32_t acc = svmul_n_u32_x(pg, src_7, half_kernel_[7]); - - // 6 - 8 - svuint32_t acc_6_8 = svadd_u32_x(pg, src_6, src_8); - acc = svmla_n_u32_x(pg, acc, acc_6_8, half_kernel_[6]); - - // 5 - 9 - svuint32_t acc_5_9 = svadd_u32_x(pg, src_5, src_9); - acc = svmla_n_u32_x(pg, acc, acc_5_9, half_kernel_[5]); - - // 4 - 10 - svuint32_t acc_4_10 = svadd_u32_x(pg, src_4, src_10); - acc = svmla_n_u32_x(pg, acc, acc_4_10, half_kernel_[4]); - - // 3 - 11 - svuint32_t acc_3_11 = svadd_u32_x(pg, src_3, src_11); - acc = svmla_n_u32_x(pg, acc, acc_3_11, half_kernel_[3]); - - // 2 - 12 - svuint32_t acc_2_12 = svadd_u32_x(pg, src_2, src_12); - acc = svmla_n_u32_x(pg, acc, acc_2_12, half_kernel_[2]); - - // 1 - 13 - svuint32_t acc_1_13 = svadd_u32_x(pg, src_1, src_13); - acc = svmla_n_u32_x(pg, acc, acc_1_13, half_kernel_[1]); - - // 0 - 14 - svuint32_t acc_0_14 = svadd_u32_x(pg, src_0, src_14); - acc = svmla_n_u32_x(pg, acc, acc_0_14, half_kernel_[0]); - - acc = svrshr_n_u32_x(pg, acc, 16); - svst1b_u32(pg, &dst[0], acc); - } - - void horizontal_scalar_path(const BufferType src[15], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = src[0] * half_kernel_[0] + src[1] * half_kernel_[1] + - src[2] * half_kernel_[2] + src[3] * half_kernel_[3] + - src[4] * half_kernel_[4] + src[5] * half_kernel_[5] + - src[6] * half_kernel_[6] + src[7] * half_kernel_[7] + - src[8] * half_kernel_[6] + src[9] * half_kernel_[5] + - src[10] * half_kernel_[4] + src[11] * half_kernel_[3] + - src[12] * half_kernel_[2] + src[13] * half_kernel_[1] + - src[14] * half_kernel_[0]; - dst[0] = static_cast(rounding_shift_right(acc, 16)); - } -}; // end of class GaussianBlur - -template <> -class GaussianBlur final - : public GaussianBlurNonBinomialBase { - public: - using SourceType = uint8_t; - using BufferType = uint32_t; - 
using DestinationType = uint8_t; - - explicit GaussianBlur(float sigma) KLEIDICV_STREAMING_COMPATIBLE - : GaussianBlurNonBinomialBase(sigma) {} - - void vertical_vector_path( - svbool_t pg, svuint8_t src_0, svuint8_t src_1, svuint8_t src_2, - svuint8_t src_3, svuint8_t src_4, svuint8_t src_5, svuint8_t src_6, - svuint8_t src_7, svuint8_t src_8, svuint8_t src_9, svuint8_t src_10, - svuint8_t src_11, svuint8_t src_12, svuint8_t src_13, svuint8_t src_14, - svuint8_t src_15, svuint8_t src_16, svuint8_t src_17, svuint8_t src_18, - svuint8_t src_19, svuint8_t src_20, + svbool_t pg, std::reference_wrapper src[KernelSize], BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - svbool_t pg16all = svptrue_b16(); - - // (10) + (9 + 11) - // Need to calculate them in 32 bits, for small sigmas they can be large - svuint16_t acc_10_0 = svmovlb_u16(src_10); - svuint16_t acc_10_1 = svmovlt_u16(src_10); - - svuint32_t acc3_00 = svmullb_n_u32(acc_10_0, half_kernel_[10]); - svuint32_t acc3_10 = svmullt_n_u32(acc_10_0, half_kernel_[10]); - svuint32_t acc3_01 = svmullb_n_u32(acc_10_1, half_kernel_[10]); - svuint32_t acc3_11 = svmullt_n_u32(acc_10_1, half_kernel_[10]); - - svuint16_t acc_9_0 = svaddlb_u16(src_9, src_11); - svuint16_t acc_9_1 = svaddlt_u16(src_9, src_11); - - acc3_00 = svmlalb_n_u32(acc3_00, acc_9_0, half_kernel_[9]); - acc3_10 = svmlalt_n_u32(acc3_10, acc_9_0, half_kernel_[9]); - acc3_01 = svmlalb_n_u32(acc3_01, acc_9_1, half_kernel_[9]); - acc3_11 = svmlalt_n_u32(acc3_11, acc_9_1, half_kernel_[9]); - - // (8 + 12) + (7 + 13) + (6 + 14) - // 16bits are enough for these products, for any sigma - svuint16_t acc_8_0 = svaddlb_u16(src_8, src_12); - svuint16_t acc_8_1 = svaddlt_u16(src_8, src_12); - - svuint16_t mul8_0 = svmul_n_u16_x(pg16all, acc_8_0, half_kernel_[8]); - svuint16_t mul8_1 = svmul_n_u16_x(pg16all, acc_8_1, half_kernel_[8]); - - svuint16_t acc_7_0 = svaddlb_u16(src_7, src_13); - svuint16_t acc_7_1 = svaddlt_u16(src_7, src_13); - - svuint16_t mul7_0 = 
svmul_n_u16_x(pg16all, acc_7_0, half_kernel_[7]); - svuint16_t mul7_1 = svmul_n_u16_x(pg16all, acc_7_1, half_kernel_[7]); - - svuint16_t acc_6_0 = svaddlb_u16(src_6, src_14); - svuint16_t acc_6_1 = svaddlt_u16(src_6, src_14); - - svuint16_t mul6_0 = svmul_n_u16_x(pg16all, acc_6_0, half_kernel_[6]); - svuint16_t mul6_1 = svmul_n_u16_x(pg16all, acc_6_1, half_kernel_[6]); - - svuint32_t acc2_00 = svaddlb_u32(mul6_0, mul7_0); - svuint32_t acc2_10 = svaddlt_u32(mul6_0, mul7_0); - svuint32_t acc2_01 = svaddlb_u32(mul6_1, mul7_1); - svuint32_t acc2_11 = svaddlt_u32(mul6_1, mul7_1); - - svbool_t pg32all = svptrue_b32(); - acc2_00 = svadd_u32_x(pg32all, acc2_00, svmovlb_u32(mul8_0)); - acc2_10 = svadd_u32_x(pg32all, acc2_10, svmovlt_u32(mul8_0)); - acc2_01 = svadd_u32_x(pg32all, acc2_01, svmovlb_u32(mul8_1)); - acc2_11 = svadd_u32_x(pg32all, acc2_11, svmovlt_u32(mul8_1)); - - // (5 + 15) + (4 + 14) + (3 + 17) - // these fit into 16 bits together with acc0 too, we can save some cycles - svuint16_t acc_5_0 = svaddlb_u16(src_5, src_15); - svuint16_t acc_5_1 = svaddlt_u16(src_5, src_15); - - svuint16_t acc1_0 = svmul_n_u16_x(pg16all, acc_5_0, half_kernel_[5]); - svuint16_t acc1_1 = svmul_n_u16_x(pg16all, acc_5_1, half_kernel_[5]); - - svuint16_t acc_4_0 = svaddlb_u16(src_4, src_16); - svuint16_t acc_4_1 = svaddlt_u16(src_4, src_16); - - acc1_0 = svmla_n_u16_x(pg16all, acc1_0, acc_4_0, half_kernel_[4]); - acc1_1 = svmla_n_u16_x(pg16all, acc1_1, acc_4_1, half_kernel_[4]); - - svuint16_t acc_3_0 = svaddlb_u16(src_3, src_17); - svuint16_t acc_3_1 = svaddlt_u16(src_3, src_17); - - acc1_0 = svmla_n_u16_x(pg16all, acc1_0, acc_3_0, half_kernel_[3]); - acc1_1 = svmla_n_u16_x(pg16all, acc1_1, acc_3_1, half_kernel_[3]); - - // (2 + 18) + (1 + 19) + (0 + 20) - // these fit into 16 bits together with acc1 too, we can save some cycles - svuint16_t acc_2_0 = svaddlb_u16(src_2, src_18); - svuint16_t acc_2_1 = svaddlt_u16(src_2, src_18); - - svuint16_t acc0_0 = svmul_n_u16_x(pg16all, acc_2_0, 
half_kernel_[2]); - svuint16_t acc0_1 = svmul_n_u16_x(pg16all, acc_2_1, half_kernel_[2]); - - svuint16_t acc_1_0 = svaddlb_u16(src_1, src_19); - svuint16_t acc_1_1 = svaddlt_u16(src_1, src_19); - - acc0_0 = svmla_n_u16_x(pg16all, acc0_0, acc_1_0, half_kernel_[1]); - acc0_1 = svmla_n_u16_x(pg16all, acc0_1, acc_1_1, half_kernel_[1]); - - svuint16_t acc_0_0 = svaddlb_u16(src_0, src_20); - svuint16_t acc_0_1 = svaddlt_u16(src_0, src_20); - - acc0_0 = svmla_n_u16_x(pg16all, acc0_0, acc_0_0, half_kernel_[0]); - acc0_1 = svmla_n_u16_x(pg16all, acc0_1, acc_0_1, half_kernel_[0]); - - // Sum them up - svuint32_t acc_second_00 = svadd_u32_x(pg32all, acc3_00, acc2_00); - svuint32_t acc_second_10 = svadd_u32_x(pg32all, acc3_10, acc2_10); - svuint32_t acc_second_01 = svadd_u32_x(pg32all, acc3_01, acc2_01); - svuint32_t acc_second_11 = svadd_u32_x(pg32all, acc3_11, acc2_11); - - svuint32_t acc_first_00 = svaddlb_u32(acc1_0, acc0_0); - svuint32_t acc_first_10 = svaddlt_u32(acc1_0, acc0_0); - svuint32_t acc_first_01 = svaddlb_u32(acc1_1, acc0_1); - svuint32_t acc_first_11 = svaddlt_u32(acc1_1, acc0_1); - - svuint32_t acc_00 = svadd_u32_x(pg32all, acc_first_00, acc_second_00); - svuint32_t acc_10 = svadd_u32_x(pg32all, acc_first_10, acc_second_10); - svuint32_t acc_01 = svadd_u32_x(pg32all, acc_first_01, acc_second_01); - svuint32_t acc_11 = svadd_u32_x(pg32all, acc_first_11, acc_second_11); - - svuint32x4_t interleaved = svcreate4(acc_00, acc_01, acc_10, acc_11); - svst4(pg, &dst[0], interleaved); + common_vector_path(pg, src, dst); } - void horizontal_vector_path( - svbool_t pg, svuint32_t src_0, svuint32_t src_1, svuint32_t src_2, - svuint32_t src_3, svuint32_t src_4, svuint32_t src_5, svuint32_t src_6, - svuint32_t src_7, svuint32_t src_8, svuint32_t src_9, svuint32_t src_10, - svuint32_t src_11, svuint32_t src_12, svuint32_t src_13, - svuint32_t src_14, svuint32_t src_15, svuint32_t src_16, - svuint32_t src_17, svuint32_t src_18, svuint32_t src_19, - svuint32_t src_20, - 
DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - svuint32_t acc = svmul_n_u32_x(pg, src_10, half_kernel_[10]); - - svuint32_t acc_9_11 = svadd_u32_x(pg, src_9, src_11); - acc = svmla_n_u32_x(pg, acc, acc_9_11, half_kernel_[9]); - - svuint32_t acc_8_12 = svadd_u32_x(pg, src_8, src_12); - acc = svmla_n_u32_x(pg, acc, acc_8_12, half_kernel_[8]); - - svuint32_t acc_7_13 = svadd_u32_x(pg, src_7, src_13); - acc = svmla_n_u32_x(pg, acc, acc_7_13, half_kernel_[7]); - - svuint32_t acc_6_14 = svadd_u32_x(pg, src_6, src_14); - acc = svmla_n_u32_x(pg, acc, acc_6_14, half_kernel_[6]); - - svuint32_t acc_5_15 = svadd_u32_x(pg, src_5, src_15); - acc = svmla_n_u32_x(pg, acc, acc_5_15, half_kernel_[5]); - - svuint32_t acc_4_16 = svadd_u32_x(pg, src_4, src_16); - acc = svmla_n_u32_x(pg, acc, acc_4_16, half_kernel_[4]); - - svuint32_t acc_3_17 = svadd_u32_x(pg, src_3, src_17); - acc = svmla_n_u32_x(pg, acc, acc_3_17, half_kernel_[3]); - - svuint32_t acc_2_18 = svadd_u32_x(pg, src_2, src_18); - acc = svmla_n_u32_x(pg, acc, acc_2_18, half_kernel_[2]); - - svuint32_t acc_1_19 = svadd_u32_x(pg, src_1, src_19); - acc = svmla_n_u32_x(pg, acc, acc_1_19, half_kernel_[1]); - - svuint32_t acc_0_20 = svadd_u32_x(pg, src_0, src_20); - acc = svmla_n_u32_x(pg, acc, acc_0_20, half_kernel_[0]); - - acc = svrshr_n_u32_x(pg, acc, 16); - svst1b_u32(pg, &dst[0], acc); + void horizontal_scalar_path(const BufferType src[KernelSize], + DestinationType *dst) const + KLEIDICV_STREAMING_COMPATIBLE { + vertical_scalar_path(src, dst); } - void horizontal_scalar_path(const BufferType src[15], DestinationType *dst) - const KLEIDICV_STREAMING_COMPATIBLE { - uint32_t acc = (src[0] + src[20]) * half_kernel_[0] + - (src[1] + src[19]) * half_kernel_[1] + - (src[2] + src[18]) * half_kernel_[2] + - (src[3] + src[17]) * half_kernel_[3] + - (src[4] + src[16]) * half_kernel_[4] + - (src[5] + src[15]) * half_kernel_[5] + - (src[6] + src[14]) * half_kernel_[6] + - (src[7] + src[13]) * half_kernel_[7] + - (src[8] 
+ src[12]) * half_kernel_[8] + - (src[9] + src[11]) * half_kernel_[9] + - src[10] * half_kernel_[10]; - dst[0] = static_cast(rounding_shift_right(acc, 16)); - } -}; // end of class GaussianBlur + private: + void common_vector_path( + svbool_t pg, std::reference_wrapper src[KernelSize], + BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svbool_t pg16_all = svptrue_b16(); + svuint16_t acc_b = svmullb_n_u16(src[kHalfKernelSize - 1], + half_kernel_[kHalfKernelSize - 1]); + svuint16_t acc_t = svmullt_n_u16(src[kHalfKernelSize - 1], + half_kernel_[kHalfKernelSize - 1]); + + // Optimization to avoid unnecessary branching in vector code. + KLEIDICV_FORCE_LOOP_UNROLL + for (size_t i = 0; i < kHalfKernelSize - 1; i++) { + const size_t j = KernelSize - i - 1; + svuint16_t vec_b = svaddlb_u16(src[i], src[j]); + svuint16_t vec_t = svaddlt_u16(src[i], src[j]); + + acc_b = svmla_n_u16_x(pg16_all, acc_b, vec_b, half_kernel_[i]); + acc_t = svmla_n_u16_x(pg16_all, acc_t, vec_t, half_kernel_[i]); + } + + // Rounding before narrowing + acc_b = svqadd_n_u16(acc_b, 128); + acc_t = svqadd_n_u16(acc_t, 128); + // Keep only the highest 8 bits + svuint8_t result = + svtrn2_u8(svreinterpret_u8_u16(acc_b), svreinterpret_u8_u16(acc_t)); + svst1(pg, &dst[0], result); + } + + const std::array half_kernel_; +}; // end of class GaussianBlur template static kleidicv_error_t gaussian_blur_fixed_kernel_size( @@ -1012,15 +337,35 @@ static kleidicv_error_t gaussian_blur_fixed_kernel_size( SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING_COMPATIBLE { using GaussianBlurFilter = GaussianBlur; - GaussianBlurFilter blur{sigma}; - SeparableFilter filter{blur}; - Rows src_rows{src, src_stride, channels}; Rows dst_rows{dst, dst_stride, channels}; - workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, - border_type, filter); - return KLEIDICV_OK; + if constexpr (IsBinomial) { + GaussianBlurFilter blur; + SeparableFilter filter{blur}; + workspace->process(rect, y_begin, y_end, 
src_rows, dst_rows, channels, + border_type, filter); + + return KLEIDICV_OK; + } else { + constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize); + auto half_kernel = generate_gaussian_half_kernel(sigma); + // If sigma is so small that the middle point gets all the weights, it's + // just a copy + if (half_kernel[kHalfKernelSize - 1] < 256) { + GaussianBlurFilter blur(half_kernel); + SeparableFilter filter{blur}; + workspace->process(rect, y_begin, y_end, src_rows, dst_rows, channels, + border_type, filter); + } else { + for (size_t row = y_begin; row < y_end; ++row) { + std::memcpy(static_cast(&dst_rows.at(row)[0]), + static_cast(&src_rows.at(row)[0]), + rect.width() * sizeof(ScalarType) * dst_rows.channels()); + } + } + return KLEIDICV_OK; + } } template @@ -1043,7 +388,8 @@ static kleidicv_error_t gaussian_blur( src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, sigma, border_type, workspace); case 15: - return gaussian_blur_fixed_kernel_size<15, IsBinomial>( + // 15x15 does not have a binomial variant + return gaussian_blur_fixed_kernel_size<15, false>( src, src_stride, dst, dst_stride, rect, y_begin, y_end, channels, sigma, border_type, workspace); case 21: diff --git a/kleidicv/src/filters/separable_filter_2d_sc.h b/kleidicv/src/filters/separable_filter_2d_sc.h index ad270c47576f525d575ec82eff06f41a3cb7a9f4..ceb125add32931c6eddf9143699090c3dee4ba5c 100644 --- a/kleidicv/src/filters/separable_filter_2d_sc.h +++ b/kleidicv/src/filters/separable_filter_2d_sc.h @@ -48,34 +48,33 @@ class SeparableFilter2D { kernel_y_4_u8_(kernel_y_4_u8) {} void vertical_vector_path( - svbool_t pg, SourceVectorType src_0, SourceVectorType src_1, - SourceVectorType src_2, SourceVectorType src_3, SourceVectorType src_4, + svbool_t pg, std::reference_wrapper src[5], BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { // 0 - BufferVectorType acc_b = svmullb_u16(src_0, kernel_y_0_u8_); - BufferVectorType acc_t = svmullt_u16(src_0, kernel_y_0_u8_); + 
BufferVectorType acc_b = svmullb_u16(src[0], kernel_y_0_u8_); + BufferVectorType acc_t = svmullt_u16(src[0], kernel_y_0_u8_); // 1 - BufferVectorType vec_b = svmullb_u16(src_1, kernel_y_1_u8_); - BufferVectorType vec_t = svmullt_u16(src_1, kernel_y_1_u8_); + BufferVectorType vec_b = svmullb_u16(src[1], kernel_y_1_u8_); + BufferVectorType vec_t = svmullt_u16(src[1], kernel_y_1_u8_); acc_b = svqadd_u16_x(pg, acc_b, vec_b); acc_t = svqadd_u16_x(pg, acc_t, vec_t); // 2 - vec_b = svmullb_u16(src_2, kernel_y_2_u8_); - vec_t = svmullt_u16(src_2, kernel_y_2_u8_); + vec_b = svmullb_u16(src[2], kernel_y_2_u8_); + vec_t = svmullt_u16(src[2], kernel_y_2_u8_); acc_b = svqadd_u16_x(pg, acc_b, vec_b); acc_t = svqadd_u16_x(pg, acc_t, vec_t); // 3 - vec_b = svmullb_u16(src_3, kernel_y_3_u8_); - vec_t = svmullt_u16(src_3, kernel_y_3_u8_); + vec_b = svmullb_u16(src[3], kernel_y_3_u8_); + vec_t = svmullt_u16(src[3], kernel_y_3_u8_); acc_b = svqadd_u16_x(pg, acc_b, vec_b); acc_t = svqadd_u16_x(pg, acc_t, vec_t); // 4 - vec_b = svmullb_u16(src_4, kernel_y_4_u8_); - vec_t = svmullt_u16(src_4, kernel_y_4_u8_); + vec_b = svmullb_u16(src[4], kernel_y_4_u8_); + vec_t = svmullt_u16(src[4], kernel_y_4_u8_); acc_b = svqadd_u16_x(pg, acc_b, vec_b); acc_t = svqadd_u16_x(pg, acc_t, vec_t); @@ -84,28 +83,27 @@ class SeparableFilter2D { } void horizontal_vector_path( - svbool_t pg, BufferVectorType src_0, BufferVectorType src_1, - BufferVectorType src_2, BufferVectorType src_3, BufferVectorType src_4, + svbool_t pg, std::reference_wrapper src[5], DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { // 0 - svuint32_t acc_b = svmullb_u32(src_0, kernel_x_0_u16_); - svuint32_t acc_t = svmullt_u32(src_0, kernel_x_0_u16_); + svuint32_t acc_b = svmullb_u32(src[0], kernel_x_0_u16_); + svuint32_t acc_t = svmullt_u32(src[0], kernel_x_0_u16_); // 1 - acc_b = svmlalb_u32(acc_b, src_1, kernel_x_1_u16_); - acc_t = svmlalt_u32(acc_t, src_1, kernel_x_1_u16_); + acc_b = svmlalb_u32(acc_b, src[1], 
kernel_x_1_u16_); + acc_t = svmlalt_u32(acc_t, src[1], kernel_x_1_u16_); // 2 - acc_b = svmlalb_u32(acc_b, src_2, kernel_x_2_u16_); - acc_t = svmlalt_u32(acc_t, src_2, kernel_x_2_u16_); + acc_b = svmlalb_u32(acc_b, src[2], kernel_x_2_u16_); + acc_t = svmlalt_u32(acc_t, src[2], kernel_x_2_u16_); // 3 - acc_b = svmlalb_u32(acc_b, src_3, kernel_x_3_u16_); - acc_t = svmlalt_u32(acc_t, src_3, kernel_x_3_u16_); + acc_b = svmlalb_u32(acc_b, src[3], kernel_x_3_u16_); + acc_t = svmlalt_u32(acc_t, src[3], kernel_x_3_u16_); // 4 - acc_b = svmlalb_u32(acc_b, src_4, kernel_x_4_u16_); - acc_t = svmlalt_u32(acc_t, src_4, kernel_x_4_u16_); + acc_b = svmlalb_u32(acc_b, src[4], kernel_x_4_u16_); + acc_t = svmlalt_u32(acc_t, src[4], kernel_x_4_u16_); svuint16_t acc_u16_b = svqxtnb_u32(acc_b); svuint16_t acc_u16 = svqxtnt_u32(acc_u16_b, acc_t); @@ -188,34 +186,33 @@ class SeparableFilter2D { kernel_y_4_u16_(kernel_y_4_u16) {} void vertical_vector_path( - svbool_t pg, SourceVectorType src_0, SourceVectorType src_1, - SourceVectorType src_2, SourceVectorType src_3, SourceVectorType src_4, + svbool_t pg, std::reference_wrapper src[5], BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { // 0 - BufferVectorType acc_b = svmullb_u32(src_0, kernel_y_0_u16_); - BufferVectorType acc_t = svmullt_u32(src_0, kernel_y_0_u16_); + BufferVectorType acc_b = svmullb_u32(src[0], kernel_y_0_u16_); + BufferVectorType acc_t = svmullt_u32(src[0], kernel_y_0_u16_); // 1 - BufferVectorType vec_b = svmullb_u32(src_1, kernel_y_1_u16_); - BufferVectorType vec_t = svmullt_u32(src_1, kernel_y_1_u16_); + BufferVectorType vec_b = svmullb_u32(src[1], kernel_y_1_u16_); + BufferVectorType vec_t = svmullt_u32(src[1], kernel_y_1_u16_); acc_b = svqadd_u32_x(pg, acc_b, vec_b); acc_t = svqadd_u32_x(pg, acc_t, vec_t); // 2 - vec_b = svmullb_u32(src_2, kernel_y_2_u16_); - vec_t = svmullt_u32(src_2, kernel_y_2_u16_); + vec_b = svmullb_u32(src[2], kernel_y_2_u16_); + vec_t = svmullt_u32(src[2], kernel_y_2_u16_); acc_b = 
svqadd_u32_x(pg, acc_b, vec_b); acc_t = svqadd_u32_x(pg, acc_t, vec_t); // 3 - vec_b = svmullb_u32(src_3, kernel_y_3_u16_); - vec_t = svmullt_u32(src_3, kernel_y_3_u16_); + vec_b = svmullb_u32(src[3], kernel_y_3_u16_); + vec_t = svmullt_u32(src[3], kernel_y_3_u16_); acc_b = svqadd_u32_x(pg, acc_b, vec_b); acc_t = svqadd_u32_x(pg, acc_t, vec_t); // 4 - vec_b = svmullb_u32(src_4, kernel_y_4_u16_); - vec_t = svmullt_u32(src_4, kernel_y_4_u16_); + vec_b = svmullb_u32(src[4], kernel_y_4_u16_); + vec_t = svmullt_u32(src[4], kernel_y_4_u16_); acc_b = svqadd_u32_x(pg, acc_b, vec_b); acc_t = svqadd_u32_x(pg, acc_t, vec_t); @@ -224,28 +221,27 @@ class SeparableFilter2D { } void horizontal_vector_path( - svbool_t pg, BufferVectorType src_0, BufferVectorType src_1, - BufferVectorType src_2, BufferVectorType src_3, BufferVectorType src_4, + svbool_t pg, std::reference_wrapper src[5], DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { // 0 - svuint64_t acc_b = svmullb_u64(src_0, kernel_x_0_u32_); - svuint64_t acc_t = svmullt_u64(src_0, kernel_x_0_u32_); + svuint64_t acc_b = svmullb_u64(src[0], kernel_x_0_u32_); + svuint64_t acc_t = svmullt_u64(src[0], kernel_x_0_u32_); // 1 - acc_b = svmlalb_u64(acc_b, src_1, kernel_x_1_u32_); - acc_t = svmlalt_u64(acc_t, src_1, kernel_x_1_u32_); + acc_b = svmlalb_u64(acc_b, src[1], kernel_x_1_u32_); + acc_t = svmlalt_u64(acc_t, src[1], kernel_x_1_u32_); // 2 - acc_b = svmlalb_u64(acc_b, src_2, kernel_x_2_u32_); - acc_t = svmlalt_u64(acc_t, src_2, kernel_x_2_u32_); + acc_b = svmlalb_u64(acc_b, src[2], kernel_x_2_u32_); + acc_t = svmlalt_u64(acc_t, src[2], kernel_x_2_u32_); // 3 - acc_b = svmlalb_u64(acc_b, src_3, kernel_x_3_u32_); - acc_t = svmlalt_u64(acc_t, src_3, kernel_x_3_u32_); + acc_b = svmlalb_u64(acc_b, src[3], kernel_x_3_u32_); + acc_t = svmlalt_u64(acc_t, src[3], kernel_x_3_u32_); // 4 - acc_b = svmlalb_u64(acc_b, src_4, kernel_x_4_u32_); - acc_t = svmlalt_u64(acc_t, src_4, kernel_x_4_u32_); + acc_b = svmlalb_u64(acc_b, 
src[4], kernel_x_4_u32_); + acc_t = svmlalt_u64(acc_t, src[4], kernel_x_4_u32_); svuint32_t acc_u32_b = svqxtnb_u64(acc_b); svuint32_t acc_u32 = svqxtnt_u64(acc_u32_b, acc_t); @@ -328,34 +324,33 @@ class SeparableFilter2D { kernel_y_4_s16_(kernel_y_4_s16) {} void vertical_vector_path( - svbool_t pg, SourceVectorType src_0, SourceVectorType src_1, - SourceVectorType src_2, SourceVectorType src_3, SourceVectorType src_4, + svbool_t pg, std::reference_wrapper src[5], BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { // 0 - BufferVectorType acc_b = svmullb_s32(src_0, kernel_y_0_s16_); - BufferVectorType acc_t = svmullt_s32(src_0, kernel_y_0_s16_); + BufferVectorType acc_b = svmullb_s32(src[0], kernel_y_0_s16_); + BufferVectorType acc_t = svmullt_s32(src[0], kernel_y_0_s16_); // 1 - BufferVectorType vec_b = svmullb_s32(src_1, kernel_y_1_s16_); - BufferVectorType vec_t = svmullt_s32(src_1, kernel_y_1_s16_); + BufferVectorType vec_b = svmullb_s32(src[1], kernel_y_1_s16_); + BufferVectorType vec_t = svmullt_s32(src[1], kernel_y_1_s16_); acc_b = svqadd_s32_x(pg, acc_b, vec_b); acc_t = svqadd_s32_x(pg, acc_t, vec_t); // 2 - vec_b = svmullb_s32(src_2, kernel_y_2_s16_); - vec_t = svmullt_s32(src_2, kernel_y_2_s16_); + vec_b = svmullb_s32(src[2], kernel_y_2_s16_); + vec_t = svmullt_s32(src[2], kernel_y_2_s16_); acc_b = svqadd_s32_x(pg, acc_b, vec_b); acc_t = svqadd_s32_x(pg, acc_t, vec_t); // 3 - vec_b = svmullb_s32(src_3, kernel_y_3_s16_); - vec_t = svmullt_s32(src_3, kernel_y_3_s16_); + vec_b = svmullb_s32(src[3], kernel_y_3_s16_); + vec_t = svmullt_s32(src[3], kernel_y_3_s16_); acc_b = svqadd_s32_x(pg, acc_b, vec_b); acc_t = svqadd_s32_x(pg, acc_t, vec_t); // 4 - vec_b = svmullb_s32(src_4, kernel_y_4_s16_); - vec_t = svmullt_s32(src_4, kernel_y_4_s16_); + vec_b = svmullb_s32(src[4], kernel_y_4_s16_); + vec_t = svmullt_s32(src[4], kernel_y_4_s16_); acc_b = svqadd_s32_x(pg, acc_b, vec_b); acc_t = svqadd_s32_x(pg, acc_t, vec_t); @@ -364,28 +359,27 @@ class 
SeparableFilter2D { } void horizontal_vector_path( - svbool_t pg, BufferVectorType src_0, BufferVectorType src_1, - BufferVectorType src_2, BufferVectorType src_3, BufferVectorType src_4, + svbool_t pg, std::reference_wrapper src[5], DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { // 0 - svint64_t acc_b = svmullb_s64(src_0, kernel_x_0_s32_); - svint64_t acc_t = svmullt_s64(src_0, kernel_x_0_s32_); + svint64_t acc_b = svmullb_s64(src[0], kernel_x_0_s32_); + svint64_t acc_t = svmullt_s64(src[0], kernel_x_0_s32_); // 1 - acc_b = svmlalb_s64(acc_b, src_1, kernel_x_1_s32_); - acc_t = svmlalt_s64(acc_t, src_1, kernel_x_1_s32_); + acc_b = svmlalb_s64(acc_b, src[1], kernel_x_1_s32_); + acc_t = svmlalt_s64(acc_t, src[1], kernel_x_1_s32_); // 2 - acc_b = svmlalb_s64(acc_b, src_2, kernel_x_2_s32_); - acc_t = svmlalt_s64(acc_t, src_2, kernel_x_2_s32_); + acc_b = svmlalb_s64(acc_b, src[2], kernel_x_2_s32_); + acc_t = svmlalt_s64(acc_t, src[2], kernel_x_2_s32_); // 3 - acc_b = svmlalb_s64(acc_b, src_3, kernel_x_3_s32_); - acc_t = svmlalt_s64(acc_t, src_3, kernel_x_3_s32_); + acc_b = svmlalb_s64(acc_b, src[3], kernel_x_3_s32_); + acc_t = svmlalt_s64(acc_t, src[3], kernel_x_3_s32_); // 4 - acc_b = svmlalb_s64(acc_b, src_4, kernel_x_4_s32_); - acc_t = svmlalt_s64(acc_t, src_4, kernel_x_4_s32_); + acc_b = svmlalb_s64(acc_b, src[4], kernel_x_4_s32_); + acc_t = svmlalt_s64(acc_t, src[4], kernel_x_4_s32_); svint32_t acc_s32_b = svqxtnb_s64(acc_b); svint32_t acc_s32 = svqxtnt_s64(acc_s32_b, acc_t); diff --git a/kleidicv/src/filters/sobel_sc.h b/kleidicv/src/filters/sobel_sc.h index 341c152d5166a9536852bc26e362c51e0c74d041..fc0a5d08f9b8fd8036b7c657c76fd491063c7431 100644 --- a/kleidicv/src/filters/sobel_sc.h +++ b/kleidicv/src/filters/sobel_sc.h @@ -34,13 +34,13 @@ class HorizontalSobel3x3 { // Applies vertical filtering vector using SIMD operations. 
// // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void vertical_vector_path(svbool_t pg, svuint8_t src_0, svuint8_t src_1, - svuint8_t src_2, BufferType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - svuint16_t acc_u16_b = svaddlb(src_0, src_2); - svuint16_t acc_u16_t = svaddlt(src_0, src_2); - acc_u16_b = svmlalb(acc_u16_b, src_1, svdup_n_u8(2)); - acc_u16_t = svmlalt(acc_u16_t, src_1, svdup_n_u8(2)); + void vertical_vector_path( + svbool_t pg, std::reference_wrapper src[3], + BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t acc_u16_b = svaddlb(src[0], src[2]); + svuint16_t acc_u16_t = svaddlt(src[0], src[2]); + acc_u16_b = svmlalb(acc_u16_b, src[1], svdup_n_u8(2)); + acc_u16_t = svmlalt(acc_u16_t, src[1], svdup_n_u8(2)); svint16x2_t interleaved = svcreate2(svreinterpret_s16(acc_u16_b), svreinterpret_s16(acc_u16_t)); @@ -51,9 +51,9 @@ class HorizontalSobel3x3 { // // DST = [ SRC0, SRC1, SRC2 ] * [ -1, 0, 1 ]T void horizontal_vector_path( - svbool_t pg, svint16_t src_0, svint16_t /* src_1 */, svint16_t src_2, + svbool_t pg, std::reference_wrapper src[3], DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { - svst1(pg, &dst[0], svsub_x(pg, src_2, src_0)); + svst1(pg, &dst[0], svsub_x(pg, src[2], src[0])); } // Applies horizontal filtering vector using scalar operations. @@ -87,11 +87,11 @@ class VerticalSobel3x3 { // Applies vertical filtering vector using SIMD operations. 
// // DST = [ SRC0, SRC1, SRC2 ] * [ -1, 0, 1 ]T - void vertical_vector_path(svbool_t pg, svuint8_t src_0, svuint8_t /* src_1 */, - svuint8_t src_2, BufferType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - svuint16_t acc_u16_b = svsublb(src_2, src_0); - svuint16_t acc_u16_t = svsublt(src_2, src_0); + void vertical_vector_path( + svbool_t pg, std::reference_wrapper src[3], + BufferType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svuint16_t acc_u16_b = svsublb(src[2], src[0]); + svuint16_t acc_u16_t = svsublt(src[2], src[0]); svint16x2_t interleaved = svcreate2(svreinterpret_s16(acc_u16_b), svreinterpret_s16(acc_u16_t)); @@ -101,11 +101,11 @@ class VerticalSobel3x3 { // Applies horizontal filtering vector using SIMD operations. // // DST = [ SRC0, SRC1, SRC2 ] * [ 1, 2, 1 ]T - void horizontal_vector_path(svbool_t pg, svint16_t src_0, svint16_t src_1, - svint16_t src_2, DestinationType *dst) const - KLEIDICV_STREAMING_COMPATIBLE { - svint16_t acc = svadd_x(pg, src_0, src_2); - acc = svmad_s16_x(pg, src_1, svdup_n_s16(2), acc); + void horizontal_vector_path( + svbool_t pg, std::reference_wrapper src[3], + DestinationType *dst) const KLEIDICV_STREAMING_COMPATIBLE { + svint16_t acc = svadd_x(pg, src[0], src[2]); + acc = svmad_s16_x(pg, src[1], svdup_n_s16(2), acc); svst1(pg, &dst[0], acc); } diff --git a/scripts/benchmark/benchmarks.txt b/scripts/benchmark/benchmarks.txt index a28ef3a9a6adbc041eb4ab9f5f4f9c6356b205a4..3e2024fed3d1c111ceb522e9883ee78e0705ed7e 100755 --- a/scripts/benchmark/benchmarks.txt +++ b/scripts/benchmark/benchmarks.txt @@ -38,7 +38,6 @@ MedianBlur7x7: opencv_perf_imgproc '*medianBlur/*' '($PIXEL_FORMAT, 8UC1, 7) GaussianBlur3x3: opencv_perf_imgproc '*gaussianBlur3x3/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' GaussianBlur5x5: opencv_perf_imgproc '*gaussianBlur5x5/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' GaussianBlur7x7: opencv_perf_imgproc '*gaussianBlur7x7/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' -GaussianBlur15x15: 
opencv_perf_imgproc '*gaussianBlur15x15/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' GaussianBlur3x3_CustomSigma: opencv_perf_imgproc '*gaussianBlur3x3_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' GaussianBlur5x5_CustomSigma: opencv_perf_imgproc '*gaussianBlur5x5_CustomSigma/*' '($PIXEL_FORMAT, 8UC1, BORDER_REPLICATE)' diff --git a/test/api/test_gaussian_blur.cpp b/test/api/test_gaussian_blur.cpp index 6c54487fc76e997d31416b4cf53ae672395370d6..0fc515dcebcb9b01e7b56ce46afc42a6858ab17f 100644 --- a/test/api/test_gaussian_blur.cpp +++ b/test/api/test_gaussian_blur.cpp @@ -39,6 +39,8 @@ static constexpr std::array kAllBorders = { KLEIDICV_BORDER_TYPE_REVERSE, }; +static constexpr size_t kToleranceOne = 1; + // Test for GaussianBlur operator. template { using typename test::KernelTest::OutputType; using ArrayContainerType = std::invoke_result_t; + static constexpr size_t kKernelSize = KernelTestParams::kKernelSize; public: explicit GaussianBlurTest( KernelTestParams, ArrayLayoutsGetterType array_layouts_getter = test::small_array_layouts, - BorderContainerType border_types = kAllBorders) - : array_layouts_{array_layouts_getter(KernelTestParams::kKernelSize - 1, + BorderContainerType border_types = kAllBorders, size_t tolerance = 0) + : test::KernelTest{tolerance}, + array_layouts_{array_layouts_getter(KernelTestParams::kKernelSize - 1, KernelTestParams::kKernelSize - 1)}, border_types_{border_types}, array_layout_generator_{array_layouts_}, - border_type_generator_{border_types_} {} + border_type_generator_{border_types_}, + sigma_{0.0} {} + + GaussianBlurTest &with_sigma(float sigma) { + sigma_ = sigma; + return *this; + } void test(const test::Array2D &mask) { test::Kernel kernel{mask}; @@ -72,6 +82,27 @@ class GaussianBlurTest : public test::KernelTest { tested_border_values, element_generator); } + void test_with_generated_mask() { + test::Array2D mask{kKernelSize, kKernelSize}; + calculate_mask(mask); + test(mask); + } + + void 
calculate_mask(test::Array2D &mask) { + constexpr size_t kHalfKernelSize = + kleidicv::get_half_kernel_size(kKernelSize); + auto half_kernel = + kleidicv::generate_gaussian_half_kernel(sigma_); + for (size_t row = 0; row < kKernelSize; ++row) { + for (size_t column = 0; column < kKernelSize; ++column) { + *mask.at(row, column) = + half_kernel[row >= kHalfKernelSize ? kKernelSize - 1 - row : row] * + half_kernel[column >= kHalfKernelSize ? kKernelSize - 1 - column + : column]; + } + } + } + private: kleidicv_error_t call_api(const test::Array2D *input, test::Array2D *output, @@ -89,8 +120,8 @@ class GaussianBlurTest : public test::KernelTest { ret = gaussian_blur()( input->data(), input->stride(), output->data(), output->stride(), input->width() / input->channels(), input->height(), input->channels(), - KernelTestParams::kKernelSize, KernelTestParams::kKernelSize, 0.0, 0.0, - border_type, context); + KernelTestParams::kKernelSize, KernelTestParams::kKernelSize, sigma_, + sigma_, border_type, context); auto releaseRet = kleidicv_filter_context_release(context); if (releaseRet != KLEIDICV_OK) { return releaseRet; @@ -102,27 +133,25 @@ class GaussianBlurTest : public test::KernelTest { // Apply rounding to nearest integer division. 
IntermediateType scale_result(const test::Kernel &, IntermediateType result) override { - if constexpr (KernelTestParams::kKernelSize == 3) { - return (result + 8) / 16; - } - if constexpr (KernelTestParams::kKernelSize == 5) { - return (result + 128) / 256; - } - if constexpr (KernelTestParams::kKernelSize == 7) { - return (result + 2048) / 4096; - } - if constexpr (KernelTestParams::kKernelSize == 15) { - return (result + 524288) / 1048576; - } - if constexpr (KernelTestParams::kKernelSize == 21) { - return (result + 32768) / 65536; + if (sigma_ == 0.0) { + if constexpr (KernelTestParams::kKernelSize == 3) { + return (result + 8) / 16; + } + if constexpr (KernelTestParams::kKernelSize == 5) { + return (result + 128) / 256; + } + if constexpr (KernelTestParams::kKernelSize == 7) { + return (result + 2048) / 4096; + } } + return (result + 32768) / 65536; } const ArrayContainerType array_layouts_; const BorderContainerType border_types_; test::SequenceGenerator array_layout_generator_; test::SequenceGenerator border_type_generator_; + float sigma_; }; // end of class GaussianBlurTest @@ -193,296 +222,105 @@ TYPED_TEST(GaussianBlur, 7x7) { GaussianBlurTest{KernelTestParams{}}.test(mask); } -// Tests gaussian_blur_15x15_ API. -TYPED_TEST(GaussianBlur, 15x15) { - using KernelTestParams = GaussianBlurKernelTestParams; - // 15x15 GaussianBlur operator. 
- test::Array2D mask{15, 15}; - // clang-format off - mask.set(0, 0, { 16, 44, 100, 192, 324, 472, 584, 632, 584, 472, 324, 192, 100, 44, 16 }); - mask.set(1, 0, { 44, 121, 275, 528, 891, 1298, 1606, 1738, 1606, 1298, 891, 528, 275, 121, 44 }); - mask.set(2, 0, { 100, 275, 625, 1200, 2025, 2950, 3650, 3950, 3650, 2950, 2025, 1200, 625, 275, 100 }); - mask.set(3, 0, { 192, 528, 1200, 2304, 3888, 5664, 7008, 7584, 7008, 5664, 3888, 2304, 1200, 528, 192 }); - mask.set(4, 0, { 324, 891, 2025, 3888, 6561, 9558, 11826, 12798, 11826, 9558, 6561, 3888, 2025, 891, 324 }); - mask.set(5, 0, { 472, 1298, 2950, 5664, 9558, 13924, 17228, 18644, 17228, 13924, 9558, 5664, 2950, 1298, 472 }); - mask.set(6, 0, { 584, 1606, 3650, 7008, 11826, 17228, 21316, 23068, 21316, 17228, 11826, 7008, 3650, 1606, 584 }); - mask.set(7, 0, { 632, 1738, 3950, 7584, 12798, 18644, 23068, 24964, 23068, 18644, 12798, 7584, 3950, 1738, 632 }); - mask.set(8, 0, { 584, 1606, 3650, 7008, 11826, 17228, 21316, 23068, 21316, 17228, 11826, 7008, 3650, 1606, 584 }); - mask.set(9, 0, { 472, 1298, 2950, 5664, 9558, 13924, 17228, 18644, 17228, 13924, 9558, 5664, 2950, 1298, 472 }); - mask.set(10, 0, { 324, 891, 2025, 3888, 6561, 9558, 11826, 12798, 11826, 9558, 6561, 3888, 2025, 891, 324 }); - mask.set(11, 0, { 192, 528, 1200, 2304, 3888, 5664, 7008, 7584, 7008, 5664, 3888, 2304, 1200, 528, 192 }); - mask.set(12, 0, { 100, 275, 625, 1200, 2025, 2950, 3650, 3950, 3650, 2950, 2025, 1200, 625, 275, 100 }); - mask.set(13, 0, { 44, 121, 275, 528, 891, 1298, 1606, 1738, 1606, 1298, 891, 528, 275, 121, 44 }); - mask.set(14, 0, { 16, 44, 100, 192, 324, 472, 584, 632, 584, 472, 324, 192, 100, 44, 16 }); - // clang-format on - GaussianBlurTest{KernelTestParams{}}.test(mask); -} - -// Tests gaussian_blur_21x21_ API. 
-TYPED_TEST(GaussianBlur, 21x21) { - using KernelTestParams = GaussianBlurKernelTestParams; - test::Array2D mask{21, 21}; - // clang-format off - mask.set(0, 0, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - mask.set(1, 0, {0, 4, 4, 8, 12, 22, 30, 40, 50, 56, 60, 56, 50, 40, 30, 22, 12, 8, 4, 4, 0}); - mask.set(2, 0, {0, 4, 4, 8, 12, 22, 30, 40, 50, 56, 60, 56, 50, 40, 30, 22, 12, 8, 4, 4, 0}); - mask.set(3, 0, {0, 8, 8, 16, 24, 44, 60, 80, 100, 112, 120, 112, 100, 80, 60, 44, 24, 16, 8, 8, 0}); - mask.set(4, 0, {0, 12, 12, 24, 36, 66, 90, 120, 150, 168, 180, 168, 150, 120, 90, 66, 36, 24, 12, 12, 0}); - mask.set(5, 0, {0, 22, 22, 44, 66, 121, 165, 220, 275, 308, 330, 308, 275, 220, 165, 121, 66, 44, 22, 22, 0}); - mask.set(6, 0, {0, 30, 30, 60, 90, 165, 225, 300, 375, 420, 450, 420, 375, 300, 225, 165, 90, 60, 30, 30, 0}); - mask.set(7, 0, {0, 40, 40, 80, 120, 220, 300, 400, 500, 560, 600, 560, 500, 400, 300, 220, 120, 80, 40, 40, 0}); - mask.set(8, 0, {0, 50, 50, 100, 150, 275, 375, 500, 625, 700, 750, 700, 625, 500, 375, 275, 150, 100, 50, 50, 0}); - mask.set(9, 0, {0, 56, 56, 112, 168, 308, 420, 560, 700, 784, 840, 784, 700, 560, 420, 308, 168, 112, 56, 56, 0}); - mask.set(10, 0, {0, 60, 60, 120, 180, 330, 450, 600, 750, 840, 900, 840, 750, 600, 450, 330, 180, 120, 60, 60, 0}); - mask.set(11, 0, {0, 56, 56, 112, 168, 308, 420, 560, 700, 784, 840, 784, 700, 560, 420, 308, 168, 112, 56, 56, 0}); - mask.set(12, 0, {0, 50, 50, 100, 150, 275, 375, 500, 625, 700, 750, 700, 625, 500, 375, 275, 150, 100, 50, 50, 0}); - mask.set(13, 0, {0, 40, 40, 80, 120, 220, 300, 400, 500, 560, 600, 560, 500, 400, 300, 220, 120, 80, 40, 40, 0}); - mask.set(14, 0, {0, 30, 30, 60, 90, 165, 225, 300, 375, 420, 450, 420, 375, 300, 225, 165, 90, 60, 30, 30, 0}); - mask.set(15, 0, {0, 22, 22, 44, 66, 121, 165, 220, 275, 308, 330, 308, 275, 220, 165, 121, 66, 44, 22, 22, 0}); - mask.set(16, 0, {0, 12, 12, 24, 36, 66, 90, 120, 150, 168, 180, 168, 150, 120, 90, 66, 36, 
24, 12, 12, 0}); - mask.set(17, 0, {0, 8, 8, 16, 24, 44, 60, 80, 100, 112, 120, 112, 100, 80, 60, 44, 24, 16, 8, 8, 0}); - mask.set(18, 0, {0, 4, 4, 8, 12, 22, 30, 40, 50, 56, 60, 56, 50, 40, 30, 22, 12, 8, 4, 4, 0}); - mask.set(19, 0, {0, 4, 4, 8, 12, 22, 30, 40, 50, 56, 60, 56, 50, 40, 30, 22, 12, 8, 4, 4, 0}); - mask.set(20, 0, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); - - // clang-format on - auto array_layouts = [](size_t w, size_t h) { - size_t vl = test::Options::vector_length(); - size_t margin = w / 2; - // two borders + one for the tail, so the NEON scalar path activates - size_t small_width = 2 * margin + 1; - // two borders + unrollonce + one for the tail - size_t medium_width = 2 * margin + vl / 4 + 1; - // two borders + unrolltwice + one for the tail - size_t big_width = 2 * margin + 2 * vl / 4 + 1; - return std::array{{ - {small_width, 2 * margin + 1, 1, 1}, - {medium_width, h, 1, 1}, - {big_width, h, 1, 1}, - }}; - }; - - GaussianBlurTest{KernelTestParams{}, array_layouts}.test(mask); -} +const auto minimal_array_layouts = [](size_t w, size_t h) { + size_t vl = test::Options::vector_length(); + size_t margin = w / 2; + // two borders + one for the tail, so the NEON scalar path activates + size_t small_width = 2 * margin + 1; + // two borders + unrolltwice + unrollonce + one for the tail + size_t big_width = 2 * margin + 3 * vl + 1; + return std::array{{ + {small_width, 2 * margin + 1, 1, 1}, + {big_width, h, 1, 1}, + }}; +}; TYPED_TEST(GaussianBlur, 3x3_CustomSigma) { - kleidicv_filter_context_t *context = nullptr; - ASSERT_EQ(KLEIDICV_OK, - kleidicv_filter_context_create(&context, 1, 15, 15, 18, 8)); - test::Array2D src{18, 8, test::Options::vector_length()}; - // clang-format off - src.set(0, 0, { 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99}); - src.set(1, 0, { 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11}); - src.set(2, 0, { 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 
44, 55, 66, 77, 88, 99, 11, 22}); - src.set(3, 0, { 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33}); - src.set(4, 0, { 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44}); - src.set(5, 0, { 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55}); - src.set(6, 0, { 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66}); - src.set(7, 0, { 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77}); - // clang-format on - - test::Array2D dst{18, 8, test::Options::vector_length()}; - EXPECT_EQ(KLEIDICV_OK, - gaussian_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 18, 8, 1, 3, 3, 4.56, 4.56, - KLEIDICV_BORDER_TYPE_WRAP, context)); - EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + using KernelTestParams = GaussianBlurKernelTestParams; + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, + kToleranceOne} + .with_sigma(2.2) + .test_with_generated_mask(); +} - test::Array2D dst_expected{18, 8, test::Options::vector_length()}; - // clang-format off - dst_expected.set(0, 0, { 51, 51, 73, 74, 73, 62, 73, 84, 107, 118, 96, 63, 40, 51, 62, 73, 73, 62}); - dst_expected.set(1, 0, { 33, 33, 44, 55, 66, 77, 88, 110, 122, 122, 89, 66, 55, 66, 77, 77, 66, 44}); - dst_expected.set(2, 0, { 33, 44, 55, 66, 77, 88, 110, 122, 122, 89, 66, 55, 66, 77, 77, 66, 44, 33}); - dst_expected.set(3, 0, { 44, 55, 66, 77, 88, 110, 122, 122, 89, 66, 55, 66, 77, 77, 66, 44, 33, 33}); - dst_expected.set(4, 0, { 55, 66, 77, 88, 110, 122, 122, 89, 66, 55, 66, 77, 77, 66, 44, 33, 33, 44}); - dst_expected.set(5, 0, { 66, 77, 88, 110, 122, 122, 89, 66, 55, 66, 77, 77, 66, 44, 33, 33, 44, 55}); - dst_expected.set(6, 0, { 77, 88, 110, 122, 122, 89, 66, 55, 66, 77, 77, 66, 44, 33, 33, 44, 55, 66}); - dst_expected.set(7, 0, { 70, 70, 92, 103, 92, 70, 59, 70, 81, 103, 92, 70, 37, 37, 48, 59, 70, 70}); - // clang-format on - EXPECT_EQ_ARRAY2D(dst_expected, 
dst); +TYPED_TEST(GaussianBlur, 3x3_TinySigma) { + using KernelTestParams = GaussianBlurKernelTestParams; + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, + kToleranceOne} + .with_sigma(0.01) + .test_with_generated_mask(); } TYPED_TEST(GaussianBlur, 5x5_CustomSigma) { - kleidicv_filter_context_t *context = nullptr; - ASSERT_EQ(KLEIDICV_OK, - kleidicv_filter_context_create(&context, 1, 15, 15, 20, 8)); - test::Array2D src{20, 8, test::Options::vector_length()}; - // clang-format off - src.set(0, 0, { 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22}); - src.set(1, 0, { 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33}); - src.set(2, 0, { 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44}); - src.set(3, 0, { 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55}); - src.set(4, 0, { 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66}); - src.set(5, 0, { 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77}); - src.set(6, 0, { 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88}); - src.set(7, 0, { 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99}); - // clang-format on - - test::Array2D dst{20, 8, test::Options::vector_length()}; - EXPECT_EQ(KLEIDICV_OK, - gaussian_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 20, 8, 1, 5, 5, 4.56, 4.56, - KLEIDICV_BORDER_TYPE_WRAP, context)); - EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + using KernelTestParams = GaussianBlurKernelTestParams; + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, + kToleranceOne} + .with_sigma(2.2) + .test_with_generated_mask(); +} - test::Array2D dst_expected{20, 8, test::Options::vector_length()}; - // clang-format off - dst_expected.set(0, 0, { 54, 65, 72, 75, 78, 81, 84, 88, 
96, 91, 82, 68, 59, 54, 58, 61, 60, 59, 54, 52}); - dst_expected.set(1, 0, { 48, 58, 61, 68, 75, 86, 90, 98, 101, 92, 79, 70, 64, 60, 64, 62, 57, 52, 47, 45}); - dst_expected.set(2, 0, { 42, 48, 55, 66, 81, 92, 100, 103, 102, 89, 80, 74, 70, 66, 65, 59, 51, 45, 40, 39}); - dst_expected.set(3, 0, { 53, 59, 66, 81, 92, 100, 103, 102, 89, 80, 74, 70, 66, 65, 59, 51, 45, 44, 43, 46}); - dst_expected.set(4, 0, { 64, 70, 81, 92, 100, 103, 102, 89, 80, 74, 70, 66, 65, 59, 51, 45, 44, 48, 51, 57}); - dst_expected.set(5, 0, { 75, 85, 92, 100, 103, 102, 89, 80, 74, 70, 66, 65, 59, 51, 45, 44, 48, 55, 62, 68}); - dst_expected.set(6, 0, { 69, 79, 87, 94, 97, 91, 82, 76, 79, 76, 75, 69, 60, 48, 46, 50, 53, 61, 63, 66}); - dst_expected.set(7, 0, { 62, 73, 80, 87, 86, 84, 79, 81, 86, 85, 80, 71, 58, 49, 52, 56, 59, 62, 61, 59}); - // clang-format on - EXPECT_EQ_ARRAY2D(dst_expected, dst); +TYPED_TEST(GaussianBlur, 5x5_TinySigma) { + using KernelTestParams = GaussianBlurKernelTestParams; + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, + kToleranceOne} + .with_sigma(0.01) + .test_with_generated_mask(); } TYPED_TEST(GaussianBlur, 7x7_CustomSigma) { - kleidicv_filter_context_t *context = nullptr; - ASSERT_EQ(KLEIDICV_OK, - kleidicv_filter_context_create(&context, 1, 15, 15, 23, 8)); - test::Array2D src{23, 8, test::Options::vector_length()}; - // clang-format off - src.set(0, 0, { 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55}); - src.set(1, 0, { 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66}); - src.set(2, 0, { 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77}); - src.set(3, 0, { 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88}); - src.set(4, 0, { 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99}); - src.set(5, 0, { 66, 77, 88, 99, 
111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111}); - src.set(6, 0, { 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222}); - src.set(7, 0, { 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33}); - // clang-format on - - test::Array2D dst{23, 8, test::Options::vector_length()}; - EXPECT_EQ(KLEIDICV_OK, - gaussian_blur()(src.data(), src.stride(), dst.data(), - dst.stride(), 23, 8, 1, 7, 7, 4.56, 4.56, - KLEIDICV_BORDER_TYPE_WRAP, context)); - EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); + using KernelTestParams = GaussianBlurKernelTestParams; + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, + kToleranceOne} + .with_sigma(2.2) + .test_with_generated_mask(); +} - test::Array2D dst_expected{23, 8, test::Options::vector_length()}; - // clang-format off - dst_expected.set(0, 0, { 76, 78, 77, 76, 82, 87, 90, 90, 85, 81, 77, 71, 65, 60, 56, 55, 56, 58, 62, 67, 68, 69, 71}); - dst_expected.set(1, 0, { 73, 75, 73, 75, 83, 88, 91, 91, 87, 83, 77, 72, 66, 61, 56, 55, 55, 56, 60, 65, 65, 66, 69}); - dst_expected.set(2, 0, { 69, 70, 72, 76, 84, 89, 92, 92, 89, 83, 78, 72, 67, 62, 57, 55, 54, 54, 58, 61, 61, 62, 65}); - dst_expected.set(3, 0, { 69, 73, 77, 78, 86, 91, 93, 93, 88, 83, 78, 72, 67, 61, 57, 53, 52, 52, 55, 62, 63, 64, 67}); - dst_expected.set(4, 0, { 82, 85, 85, 86, 91, 93, 93, 88, 83, 78, 72, 67, 61, 57, 53, 52, 52, 55, 62, 70, 72, 76, 78}); - dst_expected.set(5, 0, { 82, 85, 85, 85, 90, 92, 90, 88, 83, 77, 72, 67, 62, 58, 53, 52, 53, 56, 64, 72, 74, 76, 78}); - dst_expected.set(6, 0, { 81, 84, 83, 83, 87, 88, 89, 88, 83, 78, 73, 68, 64, 58, 54, 53, 54, 57, 65, 72, 73, 74, 77}); - dst_expected.set(7, 0, { 78, 81, 80, 80, 83, 87, 89, 88, 84, 79, 74, 70, 64, 59, 54, 54, 55, 58, 65, 70, 70, 72, 74}); - // clang-format on - EXPECT_EQ_ARRAY2D(dst_expected, dst); +TYPED_TEST(GaussianBlur, 7x7_TinySigma) { + 
using KernelTestParams = GaussianBlurKernelTestParams; + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, + kToleranceOne} + .with_sigma(0.01) + .test_with_generated_mask(); } +// Tests gaussian_blur_15x15_ API. It always uses CustomSigma. TYPED_TEST(GaussianBlur, 15x15_CustomSigma) { - kleidicv_filter_context_t *context = nullptr; - ASSERT_EQ(KLEIDICV_OK, - kleidicv_filter_context_create(&context, 1, 15, 15, 40, 22)); - test::Array2D src{40, 22, test::Options::vector_length()}; - // clang-format off - src.set(0, 0, { 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, - 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44}); - src.set(1, 0, { 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, - 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55}); - src.set(2, 0, { 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, - 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66}); - src.set(3, 0, { 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, - 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77}); - src.set(4, 0, { 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, - 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88}); - src.set(5, 0, { 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, - 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99}); - src.set(6, 0, { 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, - 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111}); - src.set(7, 0, { 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, - 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 
77, 88, 99, 111, 222}); - src.set(8, 0, { 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, - 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33}); - src.set(9, 0, { 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, - 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44}); - src.set(10, 0, { 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, - 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55}); - src.set(11, 0, { 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, - 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66}); - src.set(12, 0, { 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, - 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77}); - src.set(13, 0, { 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, - 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88}); - src.set(14, 0, { 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, - 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99}); - src.set(15, 0, { 77, 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, - 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11}); - src.set(16, 0, { 88, 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, - 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22}); - src.set(17, 0, { 99, 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, - 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33}); - src.set(18, 0, { 11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, - 
33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44}); - src.set(19, 0, { 22, 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, - 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55}); - src.set(20, 0, { 33, 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, - 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66}); - src.set(21, 0, { 44, 55, 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, - 66, 77, 88, 99, 111, 222, 33, 44, 55, 66, 77, 88, 99, 11, 22, 33, 44, 55, 66, 77}); - // clang-format on + using KernelTestParams = GaussianBlurKernelTestParams; + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, + kToleranceOne} + .test_with_generated_mask(); + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, + kToleranceOne} + .with_sigma(2.2) + .test_with_generated_mask(); +} - test::Array2D dst{40, 22, test::Options::vector_length()}; - EXPECT_EQ(KLEIDICV_OK, - gaussian_blur()( - src.data(), src.stride(), dst.data(), dst.stride(), 40, 22, 1, - 15, 15, 4.56, 4.56, KLEIDICV_BORDER_TYPE_WRAP, context)); - EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); +TYPED_TEST(GaussianBlur, 15x15_TinySigma) { + using KernelTestParams = GaussianBlurKernelTestParams; + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, + kToleranceOne} + .with_sigma(0.01) + .test_with_generated_mask(); +} - test::Array2D dst_expected{40, 22, test::Options::vector_length()}; - // clang-format off - dst_expected.set(0, 0, { 60, 63, 67, 71, 74, 77, 79, 81, 81, 81, 79, 77, 73, 70, 66, 64, 62, 61, 63, 65, - 68, 71, 74, 76, 79, 81, 81, 81, 79, 77, 73, 70, 66, 63, 60, 57, 56, 56, 56, 58}); - dst_expected.set(1, 0, { 63, 66, 70, 74, 77, 79, 80, 81, 81, 80, 78, 75, 71, 68, 65, 63, 62, 62, 64, 66, - 69, 73, 75, 78, 79, 81, 81, 80, 78, 75, 71, 68, 65, 62, 59, 58, 57, 
58, 59, 60}); - dst_expected.set(2, 0, { 67, 70, 73, 76, 79, 80, 81, 80, 80, 78, 76, 73, 70, 67, 64, 63, 62, 63, 65, 68, - 71, 74, 76, 78, 80, 80, 80, 78, 76, 73, 70, 67, 64, 62, 60, 59, 59, 60, 62, 64}); - dst_expected.set(3, 0, { 71, 74, 76, 79, 80, 80, 80, 79, 78, 76, 74, 71, 68, 66, 64, 63, 63, 65, 67, 70, - 73, 76, 77, 79, 79, 79, 78, 76, 74, 71, 68, 66, 64, 62, 61, 62, 62, 64, 66, 68}); - dst_expected.set(4, 0, { 74, 77, 79, 80, 80, 80, 79, 78, 77, 74, 72, 69, 67, 65, 64, 64, 65, 67, 69, 72, - 75, 77, 78, 79, 79, 78, 77, 74, 72, 69, 67, 65, 64, 63, 63, 64, 66, 68, 70, 72}); - dst_expected.set(5, 0, { 77, 79, 80, 80, 80, 79, 78, 76, 74, 72, 70, 68, 66, 65, 65, 66, 67, 69, 71, 74, - 76, 77, 78, 78, 77, 76, 74, 72, 70, 68, 66, 65, 65, 65, 66, 67, 69, 71, 73, 75}); - dst_expected.set(6, 0, { 79, 80, 81, 80, 79, 78, 76, 74, 72, 70, 68, 67, 66, 66, 66, 67, 69, 71, 73, 75, - 77, 77, 77, 77, 76, 74, 72, 70, 68, 67, 66, 66, 66, 67, 68, 70, 72, 74, 76, 78}); - dst_expected.set(7, 0, { 81, 81, 80, 79, 78, 76, 74, 72, 70, 69, 67, 66, 66, 67, 68, 70, 71, 73, 75, 76, - 77, 77, 76, 75, 74, 72, 70, 69, 67, 66, 66, 67, 68, 69, 71, 73, 75, 77, 79, 80}); - dst_expected.set(8, 0, { 81, 81, 80, 78, 77, 74, 72, 70, 69, 67, 66, 66, 67, 68, 70, 71, 73, 75, 76, 77, - 77, 76, 75, 74, 72, 70, 69, 67, 66, 66, 67, 68, 70, 71, 73, 76, 78, 79, 81, 81}); - dst_expected.set(9, 0, { 81, 80, 78, 76, 74, 72, 70, 69, 67, 66, 66, 67, 68, 70, 71, 73, 75, 76, 77, 77, - 76, 75, 74, 72, 70, 69, 67, 66, 66, 67, 68, 70, 71, 73, 76, 78, 79, 81, 81, 81}); - dst_expected.set(10, 0, { 79, 78, 76, 74, 72, 70, 68, 67, 66, 66, 67, 68, 70, 71, 73, 75, 76, 77, 77, 76, - 75, 74, 72, 70, 69, 67, 66, 66, 67, 68, 70, 71, 73, 75, 77, 79, 80, 81, 81, 80}); - dst_expected.set(11, 0, { 77, 75, 73, 71, 69, 68, 67, 66, 66, 67, 68, 70, 71, 73, 75, 76, 77, 77, 76, 75, - 74, 72, 70, 69, 67, 66, 66, 67, 68, 70, 71, 73, 75, 77, 78, 79, 80, 80, 79, 78}); - dst_expected.set(12, 0, { 73, 71, 70, 68, 67, 66, 66, 66, 67, 68, 70, 
71, 73, 75, 76, 77, 77, 76, 75, 74, - 72, 70, 69, 67, 66, 66, 67, 68, 70, 71, 73, 75, 76, 78, 78, 79, 78, 78, 76, 75}); - dst_expected.set(13, 0, { 70, 68, 67, 66, 65, 65, 66, 67, 68, 70, 71, 73, 75, 76, 77, 77, 76, 75, 74, 72, - 70, 69, 67, 66, 66, 67, 68, 70, 71, 73, 75, 76, 77, 78, 78, 77, 76, 75, 73, 72}); - dst_expected.set(14, 0, { 66, 65, 64, 64, 64, 65, 66, 68, 70, 71, 73, 75, 76, 77, 77, 76, 75, 74, 72, 70, - 69, 67, 66, 66, 67, 68, 70, 71, 73, 75, 76, 77, 77, 77, 76, 75, 74, 72, 70, 68}); - dst_expected.set(15, 0, { 63, 62, 62, 62, 63, 65, 67, 69, 71, 73, 75, 77, 78, 78, 77, 76, 74, 72, 70, 68, - 67, 66, 66, 66, 68, 69, 71, 73, 75, 77, 78, 78, 77, 76, 74, 72, 70, 68, 66, 64}); - dst_expected.set(16, 0, { 60, 59, 60, 61, 63, 66, 68, 71, 73, 76, 77, 78, 78, 78, 76, 74, 72, 70, 68, 66, - 66, 65, 66, 67, 69, 71, 73, 76, 77, 78, 78, 78, 76, 74, 72, 69, 67, 64, 62, 60}); - dst_expected.set(17, 0, { 57, 58, 59, 62, 64, 67, 70, 73, 76, 78, 79, 79, 79, 77, 75, 73, 70, 67, 66, 65, - 65, 65, 66, 68, 70, 73, 76, 78, 79, 79, 79, 77, 75, 72, 69, 66, 63, 61, 59, 58}); - dst_expected.set(18, 0, { 56, 57, 59, 62, 66, 69, 72, 75, 78, 79, 80, 80, 78, 76, 74, 71, 68, 65, 64, 64, - 64, 66, 67, 70, 72, 75, 78, 79, 80, 80, 78, 76, 74, 70, 67, 63, 60, 58, 57, 56}); - dst_expected.set(19, 0, { 56, 58, 60, 64, 68, 71, 74, 77, 79, 81, 81, 80, 78, 75, 72, 68, 65, 63, 63, 63, - 64, 66, 69, 71, 74, 77, 79, 81, 81, 80, 78, 75, 72, 68, 64, 61, 58, 56, 55, 55}); - dst_expected.set(20, 0, { 56, 59, 62, 66, 70, 73, 76, 79, 81, 81, 81, 79, 76, 73, 70, 66, 64, 62, 62, 63, - 65, 68, 70, 73, 76, 79, 81, 81, 81, 79, 76, 73, 70, 66, 62, 59, 57, 55, 55, 55}); - dst_expected.set(21, 0, { 58, 60, 64, 68, 72, 75, 78, 80, 81, 81, 80, 78, 75, 72, 68, 65, 63, 62, 62, 64, - 66, 69, 72, 75, 78, 80, 81, 81, 80, 78, 75, 72, 68, 64, 60, 58, 56, 55, 55, 55}); - // clang-format on - EXPECT_EQ_ARRAY2D(dst_expected, dst); +// Tests gaussian_blur_21x21_ API. It always uses CustomSigma. 
+TYPED_TEST(GaussianBlur, 21x21_CustomSigma) { + using KernelTestParams = GaussianBlurKernelTestParams; + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, + kToleranceOne} + .test_with_generated_mask(); + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, + kToleranceOne} + .with_sigma(2.2) + .test_with_generated_mask(); +} + +TYPED_TEST(GaussianBlur, 21x21_TinySigma) { + using KernelTestParams = GaussianBlurKernelTestParams; + GaussianBlurTest{KernelTestParams{}, minimal_array_layouts, kAllBorders, + kToleranceOne} + .with_sigma(0.01) + .test_with_generated_mask(); } TYPED_TEST(GaussianBlur, UnsupportedBorderType3x3) { @@ -864,7 +702,7 @@ TYPED_TEST(GaussianBlur, ValidImageSize5x5) { expected.set(1, 0, {8, 9, 9, 10}); expected.set(2, 0, {9, 9, 10, 11}); expected.set(3, 0, {10, 11, 12, 12}); - EXPECT_EQ_ARRAY2D(expected, dst); + EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, expected, dst); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); } @@ -911,80 +749,7 @@ TYPED_TEST(GaussianBlur, ValidImageSize7x7) { expected.set(3, 0, {18, 18, 19, 19, 19, 19}); expected.set(4, 0, {22, 22, 23, 23, 23, 23}); expected.set(5, 0, {24, 24, 24, 24, 24, 24}); - EXPECT_EQ_ARRAY2D(expected, dst); - - EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); -} - -TYPED_TEST(GaussianBlur, ValidImageSize15x15) { - using KernelTestParams = GaussianBlurKernelTestParams; - kleidicv_filter_context_t *context = nullptr; - size_t validSize = KernelTestParams::kKernelSize - 1; - ASSERT_EQ(KLEIDICV_OK, kleidicv_filter_context_create(&context, 1, 15, 15, - validSize, validSize)); - test::Array2D src{validSize, validSize, - test::Options::vector_length()}; - src.set(0, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); - src.set(1, 0, {28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15}); - src.set(2, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); - src.set(3, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); - src.set(4, 
0, {28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15}); - src.set(5, 0, {28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15}); - src.set(6, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); - src.set(7, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); - src.set(8, 0, {28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15}); - src.set(9, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); - src.set(10, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); - src.set(11, 0, {247, 207, 167, 127, 87, 47, 7, 3, 7, 47, 87, 127, 167, 207}); - src.set(12, 0, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}); - src.set( - 13, 0, - {255, 254, 253, 252, 251, 250, 249, 248, 247, 246, 245, 244, 243, 242}); - - test::Array2D dst{validSize, validSize, - test::Options::vector_length()}; - test::Array2D expected{validSize, validSize, - test::Options::vector_length()}; - - EXPECT_EQ(KLEIDICV_OK, gaussian_blur()( - src.data(), src.stride(), dst.data(), dst.stride(), - validSize, validSize, 1, 15, 15, 0.0, 0.0, - KLEIDICV_BORDER_TYPE_REVERSE, context)); - expected.set(0, 0, {13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14}); - expected.set(1, 0, {13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14}); - expected.set(2, 0, {13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14}); - expected.set(3, 0, {13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}); - expected.set(4, 0, {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15}); - expected.set(5, 0, {15, 15, 15, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15}); - expected.set(6, 0, {17, 17, 17, 16, 16, 15, 15, 15, 16, 16, 17, 17, 18, 18}); - expected.set(7, 0, {21, 21, 20, 20, 19, 18, 17, 17, 18, 19, 20, 21, 21, 22}); - expected.set(8, 0, {29, 29, 28, 26, 24, 22, 21, 21, 22, 23, 25, 27, 28, 29}); - expected.set(9, 0, {40, 40, 38, 35, 32, 30, 28, 28, 29, 31, 33, 36, 38, 38}); - expected.set(10, 0, {54, 53, 50, 47, 43, 39, 37, 36, 38, 40, 44, 47, 49, 50}); - expected.set(11, 0, {67, 66, 63, 58, 54, 50, 47, 
46, 47, 50, 54, 58, 61, 62}); - expected.set(12, 0, {76, 75, 72, 67, 62, 57, 54, 53, 54, 58, 62, 67, 70, 71}); - expected.set(13, 0, {80, 79, 76, 71, 65, 60, 57, 56, 57, 61, 66, 70, 73, 75}); - EXPECT_EQ_ARRAY2D(expected, dst); - - EXPECT_EQ(KLEIDICV_OK, gaussian_blur()( - src.data(), src.stride(), dst.data(), dst.stride(), - validSize, validSize, 1, 15, 15, 2.25, 2.25, - KLEIDICV_BORDER_TYPE_REVERSE, context)); - expected.set(0, 0, {13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14}); - expected.set(1, 0, {13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14}); - expected.set(2, 0, {13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14}); - expected.set(3, 0, {13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14}); - expected.set(4, 0, {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14}); - expected.set(5, 0, {15, 15, 15, 15, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15}); - expected.set(6, 0, {15, 15, 15, 15, 14, 14, 14, 14, 14, 15, 15, 16, 16, 16}); - expected.set(7, 0, {19, 19, 19, 18, 17, 16, 16, 16, 16, 17, 18, 19, 20, 20}); - expected.set(8, 0, {26, 26, 25, 23, 21, 19, 18, 18, 19, 20, 22, 24, 26, 26}); - expected.set(9, 0, {38, 37, 35, 32, 29, 26, 24, 24, 25, 27, 30, 34, 36, 37}); - expected.set(10, 0, {55, 54, 51, 46, 42, 37, 35, 34, 35, 39, 43, 47, 51, 52}); - expected.set(11, 0, {71, 70, 66, 61, 55, 49, 46, 45, 47, 51, 56, 61, 65, 67}); - expected.set(12, 0, {85, 83, 79, 73, 66, 60, 57, 55, 57, 61, 67, 73, 77, 79}); - expected.set(13, 0, {89, 88, 83, 77, 71, 65, 61, 60, 62, 66, 72, 77, 82, 83}); - EXPECT_EQ_ARRAY2D(expected, dst); + EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(1, expected, dst); EXPECT_EQ(KLEIDICV_OK, kleidicv_filter_context_release(context)); } @@ -1252,6 +1017,7 @@ static std::array generate_reference_kernel(float sigma) { return kernel_to_return; } + template void test_sigma() { const std::array expected_half_kernel = diff --git a/test/framework/kernel.h b/test/framework/kernel.h index 
5db22e3cce584bef2ae4dce549a10f46cf8aa028..a4720ed7be69613822aa23f94c529500f2acf804 100644 --- a/test/framework/kernel.h +++ b/test/framework/kernel.h @@ -74,7 +74,8 @@ class KernelTest { using IntermediateType = typename KernelTestParams::IntermediateType; using OutputType = typename KernelTestParams::OutputType; - KernelTest() : debug_{false} {} + explicit KernelTest(size_t tolerance = 0) + : debug_{false}, tolerance_{tolerance} {} // Enables debug mode. KernelTest& with_debug() { @@ -277,7 +278,7 @@ class KernelTest { EXPECT_EQ(KLEIDICV_OK, err); // Check that the actual result matches the expectation. - EXPECT_EQ_ARRAY2D(expected_, actual_); + EXPECT_EQ_ARRAY2D_WITH_TOLERANCE(tolerance_, expected_, actual_); } // Input operand for the operation. @@ -290,6 +291,8 @@ class KernelTest { Array2D actual_; // Enables debug mode. bool debug_; + // Maximum allowed absolute difference between expected and actual values. + size_t tolerance_; }; // end of class KernelTest } // namespace test