diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c index d987b2f472b78d4a262c5128b3b1f0ec83e99b5e..8b649f3f8a4bf3af9dd4608a600517edec9ebe35 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm.c @@ -235,13 +235,13 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_4x4x32_neon_i8mm( "mov x23, %x[dst]\n" "cmp x27, #0x1\n" "add x22, x23, %x[dst_stride_row]\n" - "csel x22, x22, x23, GE\n" + "csel x22, x22, x23, GT\n" "cmp x27, #0x2\n" "add x21, x23, %x[dst_stride_row], LSL #1\n" - "csel x21, x21, x22, GE\n" + "csel x21, x21, x22, GT\n" "cmp x27, #0x3\n" "add x20, x21, %x[dst_stride_row]\n" - "csel x20, x20, x21, GE\n" + "csel x20, x20, x21, GT\n" "tbz x25, #1, 6f\n" "st1 { v0.d }[0], [x20], #0x8\n" "st1 { v1.d }[0], [x21], #0x8\n" diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c index b93f81476d0016b04aa3ac382f6c585f0b814975..50e260d89bc587696e01d941029ee690469a0c0b 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm.c @@ -464,13 +464,13 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp4x8_8x4x32_neon_i8mm( "mov x23, %x[dst]\n" "cmp x12, #0x1\n" "add x22, x23, %x[dst_stride_row]\n" - "csel x22, x22, x23, GE\n" + "csel x22, x22, x23, GT\n" "cmp x12, #0x2\n" "add x21, x23, %x[dst_stride_row], LSL #1\n" - "csel x21, x21, x22, GE\n" + "csel x21, x21, x22, GT\n" "cmp x12, #0x3\n" "add x20, x21, %x[dst_stride_row]\n" - "csel x20, x20, x21, GE\n" + "csel x20, x20, x21, GT\n" "tbz x25, #1, 16f\n" "st1 { v7.d }[0], [x20], #0x8\n" "st1 { v8.d }[0], [x21], #0x8\n" diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.c index 1f7d0b8f92e8205d3a2dacaa75be3b3984e82dcb..6a07803c2e7d6b6c338cdd0753260670d3595dfb 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm.c @@ -305,13 +305,13 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_4x8x32_neon_i8mm( "mov x23, %x[dst]\n" "cmp x27, #0x1\n" "add x22, x23, %x[dst_stride_row]\n" - "csel x22, x22, x23, GE\n" + "csel x22, x22, x23, GT\n" "cmp x27, #0x2\n" "add x21, x23, %x[dst_stride_row], LSL #1\n" - "csel x21, x21, x22, GE\n" + "csel x21, x21, x22, GT\n" "cmp x27, #0x3\n" "add x20, x21, %x[dst_stride_row]\n" - "csel x20, x20, x21, GE\n" + "csel x20, x20, x21, GT\n" "tbz x25, #2, 7f\n" "st1 { v5.4s }, [x20], #0x10\n" "st1 { v7.4s }, [x21], #0x10\n" diff --git a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c index 8bcce742808dbcb70ecd9a2e1c20fe7cc1ccfc49..1bb5d9d4bf2440cf1fc54f75a07c3cb85e944f57 100644 --- a/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c +++ b/kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi4cxp/kai_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm.c @@ -702,13 +702,13 @@ void kai_run_matmul_clamp_f32_qai8dxp4x8_qsi4cxp8x8_8x8x32_neon_i8mm( "mov x23, %x[dst]\n" "cmp x12, #0x1\n" "add x22, x23, %x[dst_stride_row]\n" - "csel x22, x22, x23, GE\n" + "csel x22, x22, x23, GT\n" "cmp x12, #0x2\n" "add x21, x23, %x[dst_stride_row], LSL #1\n" - "csel x21, x21, x22, GE\n" + "csel x21, x21, x22, GT\n" "cmp x12, #0x3\n" "add x20, x21, %x[dst_stride_row]\n" - "csel x20, x20, x21, GE\n" + "csel x20, x20, x21, GT\n" "tbz x25, #2, 19f\n" "st1 { v1.4s }, [x20], #0x10\n" "st1 { v14.4s }, [x21], #0x10\n"