diff --git a/kleidicv/src/filters/gaussian_blur_neon.cpp b/kleidicv/src/filters/gaussian_blur_neon.cpp index bf4d82aa19bbfb2c3610dc0b5a23ba57ed530972..1fe35b2905c3e205d06cba34ef278ca8b4ca7a1f 100644 --- a/kleidicv/src/filters/gaussian_blur_neon.cpp +++ b/kleidicv/src/filters/gaussian_blur_neon.cpp @@ -520,35 +520,45 @@ class GaussianBlur { using BufferType = uint32_t; using DestinationType = uint8_t; + static constexpr size_t kHalfKernelSize = get_half_kernel_size(KernelSize); + + // NOLINTNEXTLINE - hicpp-member-init explicit GaussianBlur(float sigma) - : half_kernel_( - generate_gaussian_half_kernel( - sigma)) {} + : half_kernel_(generate_gaussian_half_kernel(sigma)) { + for (size_t i = 0; i < kHalfKernelSize; i++) { + half_kernel_u16_[i] = vdupq_n_u16(half_kernel_[i]); + half_kernel_u32_[i] = vdupq_n_u32(half_kernel_[i]); + } + } void vertical_vector_path(uint8x16_t src[KernelSize], BufferType *dst) const { uint16x8_t initial_l = vmovl_u8(vget_low_u8(src[KernelSize >> 1])); - uint16x8_t initial_h = vmovl_u8(vget_high_u8(src[KernelSize >> 1])); + uint16x8_t initial_h = vmovl_high_u8(src[KernelSize >> 1]); uint32x4_t acc_l_l = - vmull_n_u16(vget_low_u16(initial_l), half_kernel_[KernelSize >> 1]); + vmull_u16(vget_low_u16(initial_l), + vget_low_u16(half_kernel_u16_[KernelSize >> 1])); uint32x4_t acc_l_h = - vmull_n_u16(vget_high_u16(initial_l), half_kernel_[KernelSize >> 1]); + vmull_high_u16(initial_l, half_kernel_u16_[KernelSize >> 1]); uint32x4_t acc_h_l = - vmull_n_u16(vget_low_u16(initial_h), half_kernel_[KernelSize >> 1]); + vmull_u16(vget_low_u16(initial_h), + vget_low_u16(half_kernel_u16_[KernelSize >> 1])); uint32x4_t acc_h_h = - vmull_n_u16(vget_high_u16(initial_h), half_kernel_[KernelSize >> 1]); + vmull_high_u16(initial_h, half_kernel_u16_[KernelSize >> 1]); // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL for (size_t i = 0; i < (KernelSize >> 1); i++) { const size_t j = KernelSize - i - 1; uint16x8_t vec_l = vaddl_u8(vget_low_u8(src[i]), vget_low_u8(src[j])); - uint16x8_t vec_h = vaddl_u8(vget_high_u8(src[i]), vget_high_u8(src[j])); - - acc_l_l = vmlal_n_u16(acc_l_l, vget_low_u16(vec_l), half_kernel_[i]); - acc_l_h = vmlal_n_u16(acc_l_h, vget_high_u16(vec_l), half_kernel_[i]); - acc_h_l = vmlal_n_u16(acc_h_l, vget_low_u16(vec_h), half_kernel_[i]); - acc_h_h = vmlal_n_u16(acc_h_h, vget_high_u16(vec_h), half_kernel_[i]); + uint16x8_t vec_h = vaddl_high_u8(src[i], src[j]); + + acc_l_l = vmlal_u16(acc_l_l, vget_low_u16(vec_l), + vget_low_u16(half_kernel_u16_[i])); + acc_l_h = vmlal_high_u16(acc_l_h, vec_l, half_kernel_u16_[i]); + acc_h_l = vmlal_u16(acc_h_l, vget_low_u16(vec_h), + vget_low_u16(half_kernel_u16_[i])); + acc_h_h = vmlal_high_u16(acc_h_h, vec_h, half_kernel_u16_[i]); } uint32x4x4_t result = {acc_l_l, acc_l_h, acc_h_l, acc_h_h}; @@ -578,14 +588,14 @@ class GaussianBlur { void horizontal_vector_path(uint32x4_t src[KernelSize], DestinationType *dst) const { uint32x4_t acc = - vmulq_n_u32(src[KernelSize >> 1], half_kernel_[KernelSize >> 1]); + vmulq_u32(src[KernelSize >> 1], half_kernel_u32_[KernelSize >> 1]); // Optimization to avoid unnecessary branching in vector code. KLEIDICV_FORCE_LOOP_UNROLL for (size_t i = 0; i < (KernelSize >> 1); i++) { const size_t j = KernelSize - i - 1; uint32x4_t vec_inner = vaddq_u32(src[i], src[j]); - acc = vmlaq_n_u32(acc, vec_inner, half_kernel_[i]); + acc = vmlaq_u32(acc, vec_inner, half_kernel_u32_[i]); } uint32x4_t acc_u32 = vrshrq_n_u32(acc, 16); @@ -616,7 +626,9 @@ class GaussianBlur { } private: - const std::array half_kernel_; + const std::array half_kernel_; + uint16x8_t half_kernel_u16_[kHalfKernelSize]; + uint32x4_t half_kernel_u32_[kHalfKernelSize]; }; // end of class GaussianBlur template @@ -665,17 +677,12 @@ static kleidicv_error_t gaussian_blur(size_t kernel_size, const ScalarType *src, } } -KLEIDICV_TARGET_FN_ATTRS -kleidicv_error_t gaussian_blur_u8(const uint8_t *src, size_t src_stride, - uint8_t *dst, size_t dst_stride, size_t width, - size_t height, size_t channels, - size_t kernel_width, size_t kernel_height, - float sigma_x, float sigma_y, - kleidicv_border_type_t border_type, - kleidicv_filter_context_t *context) { - CHECK_POINTERS(context); - auto *workspace = reinterpret_cast(context); - auto fixed_border_type = get_fixed_border_type(border_type); +template +static kleidicv_error_t gaussian_blur_checks( + const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, + size_t height, size_t channels, size_t kernel_width, size_t kernel_height, + float sigma_x, float sigma_y, SeparableFilterWorkspace *workspace) { + CHECK_POINTERS(workspace); if (kernel_width != kernel_height) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; @@ -707,10 +714,33 @@ kleidicv_error_t gaussian_blur_u8(const uint8_t *src, size_t src_stride, return KLEIDICV_ERROR_CONTEXT_MISMATCH; } + return KLEIDICV_OK; +} + +KLEIDICV_TARGET_FN_ATTRS +kleidicv_error_t gaussian_blur_u8(const uint8_t *src, size_t src_stride, + uint8_t *dst, size_t dst_stride, size_t width, + size_t height, size_t channels, + size_t kernel_width, size_t kernel_height, + float sigma_x, float sigma_y, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) { + auto *workspace = reinterpret_cast(context); + kleidicv_error_t checks_result = gaussian_blur_checks( + src, src_stride, dst, dst_stride, width, height, channels, kernel_width, + kernel_height, sigma_x, sigma_y, workspace); + + if (checks_result != KLEIDICV_OK) { + return checks_result; + } + + auto fixed_border_type = get_fixed_border_type(border_type); if (!fixed_border_type) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } + Rectangle rect{width, height}; + if (sigma_x == 0.0) { return gaussian_blur(kernel_width, src, src_stride, dst, dst_stride, rect, channels, sigma_x, *fixed_border_type, diff --git a/kleidicv/src/filters/gaussian_blur_sc.h b/kleidicv/src/filters/gaussian_blur_sc.h index e11fb8a58466e1039bd9fed8e151a947d69a8123..3a3b2682e88f56a5aa984b74d2e8b431e0dab770 100644 --- a/kleidicv/src/filters/gaussian_blur_sc.h +++ b/kleidicv/src/filters/gaussian_blur_sc.h @@ -856,15 +856,13 @@ static kleidicv_error_t gaussian_blur( } } -static kleidicv_error_t gaussian_blur_u8_sc( - const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, - size_t width, size_t height, size_t channels, size_t kernel_width, - size_t kernel_height, float sigma_x, float sigma_y, - kleidicv_border_type_t border_type, - kleidicv_filter_context_t *context) KLEIDICV_STREAMING_COMPATIBLE { - CHECK_POINTERS(context); - auto *workspace = reinterpret_cast(context); - auto fixed_border_type = get_fixed_border_type(border_type); +template +static kleidicv_error_t gaussian_blur_checks( + const T *src, size_t src_stride, T *dst, size_t dst_stride, size_t width, + size_t height, size_t channels, size_t kernel_width, size_t kernel_height, + float sigma_x, float sigma_y, + SeparableFilterWorkspace *workspace) KLEIDICV_STREAMING_COMPATIBLE { + CHECK_POINTERS(workspace); if (kernel_width != kernel_height) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; @@ -890,16 +888,36 @@ static kleidicv_error_t gaussian_blur_u8_sc( return KLEIDICV_ERROR_CONTEXT_MISMATCH; } - Rectangle rect{width, height}; const Rectangle &context_rect = workspace->image_size(); if (context_rect.width() < width || context_rect.height() < height) { return KLEIDICV_ERROR_CONTEXT_MISMATCH; } + return KLEIDICV_OK; +} + +static kleidicv_error_t gaussian_blur_u8_sc( + const uint8_t *src, size_t src_stride, uint8_t *dst, size_t dst_stride, + size_t width, size_t height, size_t channels, size_t kernel_width, + size_t kernel_height, float sigma_x, float sigma_y, + kleidicv_border_type_t border_type, + kleidicv_filter_context_t *context) KLEIDICV_STREAMING_COMPATIBLE { + auto *workspace = reinterpret_cast(context); + kleidicv_error_t checks_result = gaussian_blur_checks( + src, src_stride, dst, dst_stride, width, height, channels, kernel_width, + kernel_height, sigma_x, sigma_y, workspace); + + if (checks_result != KLEIDICV_OK) { + return checks_result; + } + + auto fixed_border_type = get_fixed_border_type(border_type); if (!fixed_border_type) { return KLEIDICV_ERROR_NOT_IMPLEMENTED; } + Rectangle rect{width, height}; + if (sigma_x == 0.0) { return gaussian_blur(kernel_width, src, src_stride, dst, dst_stride, rect, channels, sigma_x, *fixed_border_type,