/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with ARMv8 ASIMD instructions.
 *
 * Copyright (C) 2014 - 2018 Linaro Ltd.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	XM		.req	v5
	XL		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	.text

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	.macro		__pmull_pre_p8
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm
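	/*
	 * __pmull_reduce_p8 below folds the 256-bit product held in
	 * XH:XM:XL back to 128 bits modulo the GHASH polynomial
	 * g(x) = x^128 + x^7 + x^2 + x + 1. Since GHASH operates on
	 * bit-reflected coefficients, multiplying by the low-order terms
	 * x^7 + x^2 + x of g(x) appears as left shifts by 57, 62 and 63,
	 * mirrored by right shifts of 7, 2 and 1 in the second phase
	 * (ushr #1 followed by ushr #6 yields the shift by 7); the caller
	 * finishes the fold with the two eor instructions that follow the
	 * macro invocation.
	 */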
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	/*
	 * void pmull_ghash_update_p8(size_t blocks, struct polyval_elem *dg,
	 *			      const u8 *src,
	 *			      const struct polyval_elem *h)
	 */
SYM_FUNC_START(pmull_ghash_update_p8)
	ld1		{SHASH.2d}, [x3]
	ld1		{XL.2d}, [x1]

	__pmull_pre_p8

0:	ld1		{T1.2d}, [x2], #16
	sub		x0, x0, #1

	/* multiply XL by SHASH in GF(2^128) */
	rev64		T1.16b, T1.16b

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_p8	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_p8	XL, XL, SHASH			// a0 * b0
	__pmull_p8	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_p8

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbnz		x0, 0b

	st1		{XL.2d}, [x1]
	ret
SYM_FUNC_END(pmull_ghash_update_p8)
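
/*
 * Call-site sketch (illustrative, not part of this file): C glue for this
 * routine would be expected to bracket the call with kernel_neon_begin()/
 * kernel_neon_end(), the arm64 kernel interface that makes the ASIMD
 * register file usable in kernel mode. The wrapper name ghash_update_blocks
 * and the polyval_elem layout are assumptions inferred from the prototype
 * above (the code loads 16 bytes from each of the dg and h pointers).
 *
 *	#include <linux/linkage.h>
 *	#include <linux/types.h>
 *	#include <asm/neon.h>
 *
 *	struct polyval_elem {			// assumed layout: one
 *		u64 lo, hi;			// 128-bit field element
 *	};
 *
 *	asmlinkage void pmull_ghash_update_p8(size_t blocks,
 *					      struct polyval_elem *dg,
 *					      const u8 *src,
 *					      const struct polyval_elem *h);
 *
 *	static void ghash_update_blocks(size_t blocks, struct polyval_elem *dg,
 *					const u8 *src,
 *					const struct polyval_elem *h)
 *	{
 *		kernel_neon_begin();
 *		pmull_ghash_update_p8(blocks, dg, src, h);
 *		kernel_neon_end();
 *	}
 */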