Diffstat (limited to 'libotr/libgcrypt-1.8.7/cipher/poly1305-armv7-neon.S')
-rw-r--r--  libotr/libgcrypt-1.8.7/cipher/poly1305-armv7-neon.S  742
1 file changed, 742 insertions, 0 deletions
diff --git a/libotr/libgcrypt-1.8.7/cipher/poly1305-armv7-neon.S b/libotr/libgcrypt-1.8.7/cipher/poly1305-armv7-neon.S
new file mode 100644
index 0000000..b4dc946
--- /dev/null
+++ b/libotr/libgcrypt-1.8.7/cipher/poly1305-armv7-neon.S
@@ -0,0 +1,742 @@
+/* poly1305-armv7-neon.S - ARMv7/NEON implementation of Poly1305
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by Andrew Moon at
+ *  https://github.com/floodyberry/poly1305-opt
+ */
+
+#include <config.h>
+
+#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
+    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_NEON)
+
+.syntax unified
+.fpu neon
+.arm
+
+#ifdef __PIC__
+# define GET_DATA_POINTER(reg, name, rtmp) \
+        ldr reg, 1f; \
+        ldr rtmp, 2f; \
+        b 3f; \
+ 1:     .word _GLOBAL_OFFSET_TABLE_-(3f+8); \
+ 2:     .word name(GOT); \
+ 3:     add reg, pc, reg; \
+        ldr reg, [reg, rtmp];
+#else
+# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
+#endif
+
+#define UNALIGNED_LDMIA2(ptr, l0, l1) \
+        tst ptr, #3; \
+        beq 1f; \
+        vpush {d0}; \
+        vld1.32 {d0}, [ptr]!; \
+        vmov l0, s0; \
+        vmov l1, s1; \
+        vpop {d0}; \
+        b 2f; \
+ 1:     ldmia ptr!, {l0-l1}; \
+ 2:     ;
+
+#define UNALIGNED_LDMIA4(ptr, l0, l1, l2, l3) \
+        tst ptr, #3; \
+        beq 1f; \
+        vpush {d0-d1}; \
+        vld1.32 {d0-d1}, [ptr]!; \
+        vmov l0, s0; \
+        vmov l1, s1; \
+        vmov l2, s2; \
+        vmov l3, s3; \
+        vpop {d0-d1}; \
+        b 2f; \
+ 1:     ldmia ptr!, {l0-l3}; \
+ 2:     ;
+
+.text
+
+.p2align 2
+.Lpoly1305_init_constants_neon:
+.long 0x3ffff03
+.long 0x3ffc0ff
+.long 0x3f03fff
+.long 0x00fffff
+
+.globl _gcry_poly1305_armv7_neon_init_ext
+.type _gcry_poly1305_armv7_neon_init_ext,%function;
+_gcry_poly1305_armv7_neon_init_ext:
+.Lpoly1305_init_ext_neon_local:
+        stmfd sp!, {r4-r11, lr}
+        sub sp, sp, #32
+        mov r14, #-1
+        UNALIGNED_LDMIA4(r1, r2, r3, r4, r5)
+        GET_DATA_POINTER(r7,.Lpoly1305_init_constants_neon,r8)
+        mov r6, r2
+        mov r8, r2, lsr #26
+        mov r9, r3, lsr #20
+        mov r10, r4, lsr #14
+        mov r11, r5, lsr #8
+        orr r8, r8, r3, lsl #6
+        orr r9, r9, r4, lsl #12
+        orr r10, r10, r5, lsl #18
+        ldmia r7, {r2-r5}
+        and r2, r2, r8
+        and r3, r3, r9
+        and r4, r4, r10
+        and r5, r5, r11
+        and r6, r6, 0x3ffffff
+        stmia r0!, {r2-r6}
+        eor r8, r8, r8
+        str r8, [sp, #24]
+.Lpoly1305_init_ext_neon_squareloop:
+        ldr r8, [sp, #24]
+        mov r12, #16
+        cmp r8, #2
+        beq .Lpoly1305_init_ext_neon_donesquaring
+        cmp r8, #1
+        moveq r12, #64
+        cmp r14, r12
+        bls .Lpoly1305_init_ext_neon_donesquaring
+        add r8, #1
+        str r8, [sp, #24]
+        mov r6, r6, lsl #1
+        mov r2, r2, lsl #1
+        umull r7, r8, r3, r3
+        umull r9, r10, r6, r4
+        umlal r7, r8, r6, r5
+        umlal r9, r10, r2, r3
+        add r11, r5, r5, lsl #2
+        umlal r7, r8, r2, r4
+        umlal r9, r10, r5, r11
+        str r7, [sp, #16]
+        str r8, [sp, #20]
+        mov r2, r2, lsr #1
+        mov r5, r5, lsl #1
+        str r9, [sp, #8]
+        str r10, [sp, #12]
+        umull r7, r8, r2, r2
+        umull r9, r10, r6, r2
+        add r11, r3, r3, lsl #2
+        add r12, r4, r4, lsl #2
+        umlal r7, r8, r6, r3
+        umlal r9, r10, r5, r11
+        umlal r7, r8, r5, r12
+        umlal r9, r10, r4, r12
+        mov r6, r6, lsr #1
+        mov r3, r3, lsl #1
+        add r11, r2, r2, lsl #2
+        str r7, [sp, #0]
+        str r8, [sp, #4]
+        umull r7, r8, r6, r6
+        umlal r7, r8, r3, r12
+        umlal r7, r8, r5, r11
+        and r6, r7, 0x3ffffff
+        mov r11, r7, lsr #26
+        orr r11, r11, r8, lsl #6
+        ldr r7, [sp, #0]
+        ldr r8, [sp, #4]
+        adds r9, r9, r11
+        adc r10, r10, #0
+        and r2, r9, 0x3ffffff
+        mov r11, r9, lsr #26
+        orr r11, r11, r10, lsl #6
+        ldr r9, [sp, #8]
+        ldr r10, [sp, #12]
+        adds r7, r7, r11
+        adc r8, r8, #0
+        and r3, r7, 0x3ffffff
+        mov r11, r7, lsr #26
+        orr r11, r11, r8, lsl #6
+        ldr r7, [sp, #16]
+        ldr r8, [sp, #20]
+        adds r9, r9, r11
+        adc r10, r10, #0
+        and r4, r9, 0x3ffffff
+        mov r11, r9, lsr #26
+        orr r11, r11, r10, lsl #6
+        adds r7, r7, r11
+        adc r8, r8, #0
+        and r5, r7, 0x3ffffff
+        mov r11, r7, lsr #26
+        orr r11, r11, r8, lsl #6
+        add r11, r11, r11, lsl #2
+        add r6, r6, r11
+        mov r11, r6, lsr #26
+        and r6, r6, 0x3ffffff
+        add r2, r2, r11
+        stmia r0!, {r2-r6}
+        b .Lpoly1305_init_ext_neon_squareloop
+.Lpoly1305_init_ext_neon_donesquaring:
+        mov r2, #2
+        ldr r14, [sp, #24]
+        sub r14, r2, r14
+        mov r3, r14, lsl #4
+        add r3, r3, r14, lsl #2
+        add r0, r0, r3
+        eor r2, r2, r2
+        eor r3, r3, r3
+        eor r4, r4, r4
+        eor r5, r5, r5
+        eor r6, r6, r6
+        stmia r0!, {r2-r6}
+        stmia r0!, {r2-r6}
+        UNALIGNED_LDMIA4(r1, r2, r3, r4, r5)
+        stmia r0, {r2-r6}
+        add sp, sp, #32
+        ldmfd sp!, {r4-r11, lr}
+        mov r0, #(9*4+32)
+        bx lr
+.ltorg
+.size _gcry_poly1305_armv7_neon_init_ext,.-_gcry_poly1305_armv7_neon_init_ext;
+
+.globl _gcry_poly1305_armv7_neon_blocks
+.type _gcry_poly1305_armv7_neon_blocks,%function;
+_gcry_poly1305_armv7_neon_blocks:
+.Lpoly1305_blocks_neon_local:
+        vmov.i32 q0, #0xffffffff
+        vmov.i32 d4, #1
+        vsubw.u32 q0, q0, d4
+        vstmdb sp!, {q4,q5,q6,q7}
+        stmfd sp!, {r4-r11, lr}
+        mov r8, sp
+        and sp, sp, #~63
+        sub sp, sp, #192
+        str r0, [sp, #108]
+        str r1, [sp, #112]
+        str r2, [sp, #116]
+        str r8, [sp, #120]
+        mov r3, r0
+        mov r0, r1
+        mov r1, r2
+        mov r2, r3
+        ldr r8, [r2, #116]
+        veor d15, d15, d15
+        vorr.i32 d15, #(1 << 24)
+        tst r8, #2
+        beq .Lpoly1305_blocks_neon_skip_shift8
+        vshr.u64 d15, #32
+.Lpoly1305_blocks_neon_skip_shift8:
+        tst r8, #4
+        beq .Lpoly1305_blocks_neon_skip_shift16
+        veor d15, d15, d15
+.Lpoly1305_blocks_neon_skip_shift16:
+        vst1.64 d15, [sp, :64]
+        tst r8, #1
+        bne .Lpoly1305_blocks_neon_started
+        vld1.64 {q0-q1}, [r0]!
+        vswp d1, d2
+        vmovn.i64 d21, q0
+        vshrn.i64 d22, q0, #26
+        vshrn.u64 d24, q1, #14
+        vext.8 d0, d0, d2, #4
+        vext.8 d1, d1, d3, #4
+        vshr.u64 q1, q1, #32
+        vshrn.i64 d23, q0, #20
+        vshrn.u64 d25, q1, #8
+        vand.i32 d21, #0x03ffffff
+        vand.i32 q11, #0x03ffffff
+        vand.i32 q12, #0x03ffffff
+        orr r8, r8, #1
+        sub r1, r1, #32
+        str r8, [r2, #116]
+        vorr d25, d25, d15
+        b .Lpoly1305_blocks_neon_setupr20
+.Lpoly1305_blocks_neon_started:
+        add r9, r2, #60
+        vldm r9, {d21-d25}
+.Lpoly1305_blocks_neon_setupr20:
+        vmov.i32 d0, #5
+        tst r8, #(8|16)
+        beq .Lpoly1305_blocks_neon_setupr20_simple
+        tst r8, #(8)
+        beq .Lpoly1305_blocks_neon_setupr20_r_1
+        mov r9, r2
+        add r10, r2, #20
+        vld1.64 {q9}, [r9]!
+        vld1.64 {q8}, [r10]!
+        vld1.64 {d2}, [r9]
+        vld1.64 {d20}, [r10]
+        b .Lpoly1305_blocks_neon_setupr20_hard
+.Lpoly1305_blocks_neon_setupr20_r_1:
+        mov r9, r2
+        vmov.i32 d2, #1
+        vld1.64 {q8}, [r9]!
+        veor q9, q9, q9
+        vshr.u64 d2, d2, #32
+        vld1.64 {d20}, [r9]
+.Lpoly1305_blocks_neon_setupr20_hard:
+        vzip.i32 q8, q9
+        vzip.i32 d20, d2
+        b .Lpoly1305_blocks_neon_setups20
+.Lpoly1305_blocks_neon_setupr20_simple:
+        add r9, r2, #20
+        vld1.64 {d2-d4}, [r9]
+        vdup.32 d16, d2[0]
+        vdup.32 d17, d2[1]
+        vdup.32 d18, d3[0]
+        vdup.32 d19, d3[1]
+        vdup.32 d20, d4[0]
+.Lpoly1305_blocks_neon_setups20:
+        vmul.i32 q13, q8, d0[0]
+        vmov.i64 q15, 0x00000000ffffffff
+        vmul.i32 q14, q9, d0[0]
+        vshr.u64 q15, q15, #6
+        cmp r1, #64
+        blo .Lpoly1305_blocks_neon_try32
+        add r9, sp, #16
+        add r10, r2, #40
+        add r11, sp, #64
+        str r1, [sp, #116]
+        vld1.64 {d10-d12}, [r10]
+        vmov d14, d12
+        vmul.i32 q6, q5, d0[0]
+.Lpoly1305_blocks_neon_mainloop:
+        UNALIGNED_LDMIA4(r0, r2, r3, r4, r5)
+        vmull.u32 q0, d25, d12[0]
+        mov r7, r2, lsr #26
+        vmlal.u32 q0, d24, d12[1]
+        mov r8, r3, lsr #20
+        ldr r6, [sp, #0]
+        vmlal.u32 q0, d23, d13[0]
+        mov r9, r4, lsr #14
+        vmlal.u32 q0, d22, d13[1]
+        orr r6, r6, r5, lsr #8
+        vmlal.u32 q0, d21, d14[0]
+        orr r3, r7, r3, lsl #6
+        vmull.u32 q1, d25, d12[1]
+        orr r4, r8, r4, lsl #12
+        orr r5, r9, r5, lsl #18
+        vmlal.u32 q1, d24, d13[0]
+        UNALIGNED_LDMIA4(r0, r7, r8, r9, r10)
+        vmlal.u32 q1, d23, d13[1]
+        mov r1, r7, lsr #26
+        vmlal.u32 q1, d22, d14[0]
+        ldr r11, [sp, #4]
+        mov r12, r8, lsr #20
+        vmlal.u32 q1, d21, d10[0]
+        mov r14, r9, lsr #14
+        vmull.u32 q2, d25, d13[0]
+        orr r11, r11, r10, lsr #8
+        orr r8, r1, r8, lsl #6
+        vmlal.u32 q2, d24, d13[1]
+        orr r9, r12, r9, lsl #12
+        vmlal.u32 q2, d23, d14[0]
+        orr r10, r14, r10, lsl #18
+        vmlal.u32 q2, d22, d10[0]
+        mov r12, r3
+        and r2, r2, #0x3ffffff
+        vmlal.u32 q2, d21, d10[1]
+        mov r14, r5
+        vmull.u32 q3, d25, d13[1]
+        and r3, r7, #0x3ffffff
+        vmlal.u32 q3, d24, d14[0]
+        and r5, r8, #0x3ffffff
+        vmlal.u32 q3, d23, d10[0]
+        and r7, r9, #0x3ffffff
+        vmlal.u32 q3, d22, d10[1]
+        and r8, r14, #0x3ffffff
+        vmlal.u32 q3, d21, d11[0]
+        and r9, r10, #0x3ffffff
+        add r14, sp, #128
+        vmull.u32 q4, d25, d14[0]
+        mov r10, r6
+        vmlal.u32 q4, d24, d10[0]
+        and r6, r4, #0x3ffffff
+        vmlal.u32 q4, d23, d10[1]
+        and r4, r12, #0x3ffffff
+        vmlal.u32 q4, d22, d11[0]
+        stm r14, {r2-r11}
+        vmlal.u32 q4, d21, d11[1]
+        vld1.64 {d21-d24}, [r14, :256]!
+        vld1.64 {d25}, [r14, :64]
+        UNALIGNED_LDMIA4(r0, r2, r3, r4, r5)
+        vmlal.u32 q0, d25, d26
+        mov r7, r2, lsr #26
+        vmlal.u32 q0, d24, d27
+        ldr r6, [sp, #0]
+        mov r8, r3, lsr #20
+        vmlal.u32 q0, d23, d28
+        mov r9, r4, lsr #14
+        vmlal.u32 q0, d22, d29
+        orr r6, r6, r5, lsr #8
+        vmlal.u32 q0, d21, d20
+        orr r3, r7, r3, lsl #6
+        vmlal.u32 q1, d25, d27
+        orr r4, r8, r4, lsl #12
+        orr r5, r9, r5, lsl #18
+        vmlal.u32 q1, d24, d28
+        UNALIGNED_LDMIA4(r0, r7, r8, r9, r10)
+        vmlal.u32 q1, d23, d29
+        mov r1, r7, lsr #26
+        vmlal.u32 q1, d22, d20
+        ldr r11, [sp, #4]
+        mov r12, r8, lsr #20
+        vmlal.u32 q1, d21, d16
+        mov r14, r9, lsr #14
+        vmlal.u32 q2, d25, d28
+        orr r11, r11, r10, lsr #8
+        orr r8, r1, r8, lsl #6
+        orr r9, r12, r9, lsl #12
+        vmlal.u32 q2, d24, d29
+        orr r10, r14, r10, lsl #18
+        and r2, r2, #0x3ffffff
+        mov r12, r3
+        vmlal.u32 q2, d23, d20
+        mov r14, r5
+        vmlal.u32 q2, d22, d16
+        and r3, r7, #0x3ffffff
+        vmlal.u32 q2, d21, d17
+        and r5, r8, #0x3ffffff
+        vmlal.u32 q3, d25, d29
+        and r7, r9, #0x3ffffff
+        vmlal.u32 q3, d24, d20
+        and r8, r14, #0x3ffffff
+        vmlal.u32 q3, d23, d16
+        and r9, r10, #0x3ffffff
+        vmlal.u32 q3, d22, d17
+        add r14, sp, #128
+        vmlal.u32 q3, d21, d18
+        mov r10, r6
+        vmlal.u32 q4, d25, d20
+        vmlal.u32 q4, d24, d16
+        and r6, r4, #0x3ffffff
+        vmlal.u32 q4, d23, d17
+        and r4, r12, #0x3ffffff
+        vmlal.u32 q4, d22, d18
+        stm r14, {r2-r11}
+        vmlal.u32 q4, d21, d19
+        vld1.64 {d21-d24}, [r14, :256]!
+        vld1.64 {d25}, [r14, :64]
+        vaddw.u32 q0, q0, d21
+        vaddw.u32 q1, q1, d22
+        vaddw.u32 q2, q2, d23
+        vaddw.u32 q3, q3, d24
+        vaddw.u32 q4, q4, d25
+        vshr.u64 q11, q0, #26
+        vand q0, q0, q15
+        vadd.i64 q1, q1, q11
+        vshr.u64 q12, q3, #26
+        vand q3, q3, q15
+        vadd.i64 q4, q4, q12
+        vshr.u64 q11, q1, #26
+        vand q1, q1, q15
+        vadd.i64 q2, q2, q11
+        vshr.u64 q12, q4, #26
+        vand q4, q4, q15
+        vadd.i64 q0, q0, q12
+        vshl.i64 q12, q12, #2
+        ldr r1, [sp, #116]
+        vadd.i64 q0, q0, q12
+        vshr.u64 q11, q2, #26
+        vand q2, q2, q15
+        vadd.i64 q3, q3, q11
+        sub r1, #64
+        vshr.u64 q12, q0, #26
+        vand q0, q0, q15
+        vadd.i64 q1, q1, q12
+        cmp r1, #64
+        vshr.u64 q11, q3, #26
+        vand q3, q3, q15
+        vadd.i64 q4, q4, q11
+        vmovn.i64 d21, q0
+        str r1, [sp, #116]
+        vmovn.i64 d22, q1
+        vmovn.i64 d23, q2
+        vmovn.i64 d24, q3
+        vmovn.i64 d25, q4
+        bhs .Lpoly1305_blocks_neon_mainloop
+.Lpoly1305_blocks_neon_try32:
+        cmp r1, #32
+        blo .Lpoly1305_blocks_neon_done
+        tst r0, r0
+        bne .Lpoly1305_blocks_loadm32
+        veor q0, q0, q0
+        veor q1, q1, q1
+        veor q2, q2, q2
+        veor q3, q3, q3
+        veor q4, q4, q4
+        b .Lpoly1305_blocks_continue32
+.Lpoly1305_blocks_loadm32:
+        vld1.64 {q0-q1}, [r0]!
+        veor q4, q4, q4
+        vswp d1, d2
+        veor q3, q3, q3
+        vtrn.32 q0, q4
+        vtrn.32 q1, q3
+        vshl.i64 q2, q1, #12
+        vshl.i64 q3, q3, #18
+        vshl.i64 q1, q4, #6
+        vmovl.u32 q4, d15
+.Lpoly1305_blocks_continue32:
+        vmlal.u32 q0, d25, d26
+        vmlal.u32 q0, d24, d27
+        vmlal.u32 q0, d23, d28
+        vmlal.u32 q0, d22, d29
+        vmlal.u32 q0, d21, d20
+        vmlal.u32 q1, d25, d27
+        vmlal.u32 q1, d24, d28
+        vmlal.u32 q1, d23, d29
+        vmlal.u32 q1, d22, d20
+        vmlal.u32 q1, d21, d16
+        vmlal.u32 q2, d25, d28
+        vmlal.u32 q2, d24, d29
+        vmlal.u32 q2, d23, d20
+        vmlal.u32 q2, d22, d16
+        vmlal.u32 q2, d21, d17
+        vmlal.u32 q3, d25, d29
+        vmlal.u32 q3, d24, d20
+        vmlal.u32 q3, d23, d16
+        vmlal.u32 q3, d22, d17
+        vmlal.u32 q3, d21, d18
+        vmlal.u32 q4, d25, d20
+        vmlal.u32 q4, d24, d16
+        vmlal.u32 q4, d23, d17
+        vmlal.u32 q4, d22, d18
+        vmlal.u32 q4, d21, d19
+        vshr.u64 q11, q0, #26
+        vand q0, q0, q15
+        vadd.i64 q1, q1, q11
+        vshr.u64 q12, q3, #26
+        vand q3, q3, q15
+        vadd.i64 q4, q4, q12
+        vshr.u64 q11, q1, #26
+        vand q1, q1, q15
+        vadd.i64 q2, q2, q11
+        vshr.u64 q12, q4, #26
+        vand q4, q4, q15
+        vadd.i64 q0, q0, q12
+        vshl.i64 q12, q12, #2
+        vadd.i64 q0, q0, q12
+        vshr.u64 q11, q2, #26
+        vand q2, q2, q15
+        vadd.i64 q3, q3, q11
+        vshr.u64 q12, q0, #26
+        vand q0, q0, q15
+        vadd.i64 q1, q1, q12
+        vshr.u64 q11, q3, #26
+        vand q3, q3, q15
+        vadd.i64 q4, q4, q11
+        vmovn.i64 d21, q0
+        vmovn.i64 d22, q1
+        vmovn.i64 d23, q2
+        vmovn.i64 d24, q3
+        vmovn.i64 d25, q4
+.Lpoly1305_blocks_neon_done:
+        tst r0, r0
+        beq .Lpoly1305_blocks_neon_final
+        ldr r2, [sp, #108]
+        add r2, r2, #60
+        vst1.64 {d21}, [r2]!
+        vst1.64 {d22-d25}, [r2]
+        b .Lpoly1305_blocks_neon_leave
+.Lpoly1305_blocks_neon_final:
+        vadd.u32 d10, d0, d1
+        vadd.u32 d13, d2, d3
+        vadd.u32 d11, d4, d5
+        ldr r5, [sp, #108]
+        vadd.u32 d14, d6, d7
+        vadd.u32 d12, d8, d9
+        vtrn.32 d10, d13
+        vtrn.32 d11, d14
+        vst1.64 {d10-d12}, [sp]
+        ldm sp, {r0-r4}
+        mov r12, r0, lsr #26
+        and r0, r0, #0x3ffffff
+        add r1, r1, r12
+        mov r12, r1, lsr #26
+        and r1, r1, #0x3ffffff
+        add r2, r2, r12
+        mov r12, r2, lsr #26
+        and r2, r2, #0x3ffffff
+        add r3, r3, r12
+        mov r12, r3, lsr #26
+        and r3, r3, #0x3ffffff
+        add r4, r4, r12
+        mov r12, r4, lsr #26
+        and r4, r4, #0x3ffffff
+        add r12, r12, r12, lsl #2
+        add r0, r0, r12
+        mov r12, r0, lsr #26
+        and r0, r0, #0x3ffffff
+        add r1, r1, r12
+        mov r12, r1, lsr #26
+        and r1, r1, #0x3ffffff
+        add r2, r2, r12
+        mov r12, r2, lsr #26
+        and r2, r2, #0x3ffffff
+        add r3, r3, r12
+        mov r12, r3, lsr #26
+        and r3, r3, #0x3ffffff
+        add r4, r4, r12
+        mov r12, r4, lsr #26
+        and r4, r4, #0x3ffffff
+        add r12, r12, r12, lsl #2
+        add r0, r0, r12
+        mov r12, r0, lsr #26
+        and r0, r0, #0x3ffffff
+        add r1, r1, r12
+        add r6, r0, #5
+        mov r12, r6, lsr #26
+        and r6, r6, #0x3ffffff
+        add r7, r1, r12
+        mov r12, r7, lsr #26
+        and r7, r7, #0x3ffffff
+        add r10, r2, r12
+        mov r12, r10, lsr #26
+        and r10, r10, #0x3ffffff
+        add r11, r3, r12
+        mov r12, #-(1 << 26)
+        add r12, r12, r11, lsr #26
+        and r11, r11, #0x3ffffff
+        add r14, r4, r12
+        mov r12, r14, lsr #31
+        sub r12, #1
+        and r6, r6, r12
+        and r7, r7, r12
+        and r10, r10, r12
+        and r11, r11, r12
+        and r14, r14, r12
+        mvn r12, r12
+        and r0, r0, r12
+        and r1, r1, r12
+        and r2, r2, r12
+        and r3, r3, r12
+        and r4, r4, r12
+        orr r0, r0, r6
+        orr r1, r1, r7
+        orr r2, r2, r10
+        orr r3, r3, r11
+        orr r4, r4, r14
+        orr r0, r0, r1, lsl #26
+        lsr r1, r1, #6
+        orr r1, r1, r2, lsl #20
+        lsr r2, r2, #12
+        orr r2, r2, r3, lsl #14
+        lsr r3, r3, #18
+        orr r3, r3, r4, lsl #8
+        add r5, r5, #60
+        stm r5, {r0-r3}
+.Lpoly1305_blocks_neon_leave:
+        sub r0, sp, #8
+        ldr sp, [sp, #120]
+        ldmfd sp!, {r4-r11, lr}
+        vldm sp!, {q4-q7}
+        sub r0, sp, r0
+        bx lr
+.size _gcry_poly1305_armv7_neon_blocks,.-_gcry_poly1305_armv7_neon_blocks;
+
+.globl _gcry_poly1305_armv7_neon_finish_ext
+.type _gcry_poly1305_armv7_neon_finish_ext,%function;
+_gcry_poly1305_armv7_neon_finish_ext:
+.Lpoly1305_finish_ext_neon_local:
+        stmfd sp!, {r4-r11, lr}
+        sub sp, sp, #32
+        mov r5, r0
+        mov r6, r1
+        mov r7, r2
+        mov r8, r3
+        ands r7, r7, r7
+        beq .Lpoly1305_finish_ext_neon_noremaining
+        mov r9, sp
+        veor q0, q0, q0
+        veor q1, q1, q1
+        vst1.64 {q0-q1}, [sp]
+        tst r7, #16
+        beq .Lpoly1305_finish_ext_neon_skip16
+        vld1.u64 {q0}, [r1]!
+        vst1.64 {q0}, [r9]!
+.Lpoly1305_finish_ext_neon_skip16:
+        tst r7, #8
+        beq .Lpoly1305_finish_ext_neon_skip8
+        UNALIGNED_LDMIA2(r1, r10, r11)
+        stmia r9!, {r10-r11}
+.Lpoly1305_finish_ext_neon_skip8:
+        tst r7, #4
+        beq .Lpoly1305_finish_ext_neon_skip4
+        ldr r10, [r1], #4
+        str r10, [r9], #4
+.Lpoly1305_finish_ext_neon_skip4:
+        tst r7, #2
+        beq .Lpoly1305_finish_ext_neon_skip2
+        ldrh r10, [r1], #2
+        strh r10, [r9], #2
+.Lpoly1305_finish_ext_neon_skip2:
+        tst r7, #1
+        beq .Lpoly1305_finish_ext_neon_skip1
+        ldrb r10, [r1], #1
+        strb r10, [r9], #1
+.Lpoly1305_finish_ext_neon_skip1:
+        cmp r7, #16
+        beq .Lpoly1305_finish_ext_neon_skipfinalbit
+        mov r10, #1
+        strb r10, [r9]
+.Lpoly1305_finish_ext_neon_skipfinalbit:
+        ldr r10, [r5, #116]
+        orrhs r10, #2
+        orrlo r10, #4
+        str r10, [r5, #116]
+        mov r0, r5
+        mov r1, sp
+        mov r2, #32
+        bl .Lpoly1305_blocks_neon_local
+.Lpoly1305_finish_ext_neon_noremaining:
+        ldr r10, [r5, #116]
+        tst r10, #1
+        beq .Lpoly1305_finish_ext_neon_notstarted
+        cmp r7, #0
+        beq .Lpoly1305_finish_ext_neon_user2r
+        cmp r7, #16
+        bls .Lpoly1305_finish_ext_neon_user1
+.Lpoly1305_finish_ext_neon_user2r:
+        orr r10, r10, #8
+        b .Lpoly1305_finish_ext_neon_finalblock
+.Lpoly1305_finish_ext_neon_user1:
+        orr r10, r10, #16
+.Lpoly1305_finish_ext_neon_finalblock:
+        str r10, [r5, #116]
+        mov r0, r5
+        eor r1, r1, r1
+        mov r2, #32
+        bl .Lpoly1305_blocks_neon_local
+.Lpoly1305_finish_ext_neon_notstarted:
+        add r0, r5, #60
+        add r9, r5, #100
+        ldm r0, {r0-r3}
+        ldm r9, {r9-r12}
+        adds r0, r0, r9
+        adcs r1, r1, r10
+        adcs r2, r2, r11
+        adcs r3, r3, r12
+        stm r8, {r0-r3}
+        veor q0, q0, q0
+        veor q1, q1, q1
+        veor q2, q2, q2
+        veor q3, q3, q3
+        vstmia r5!, {q0-q3}
+        vstm r5, {q0-q3}
+        add sp, sp, #32
+        ldmfd sp!, {r4-r11, lr}
+        mov r0, #(9*4+32)
+        bx lr
+.size _gcry_poly1305_armv7_neon_finish_ext,.-_gcry_poly1305_armv7_neon_finish_ext;
+
+#endif
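
Note: the file exports an init/blocks/finish style interface (_gcry_poly1305_armv7_neon_init_ext, _gcry_poly1305_armv7_neon_blocks, _gcry_poly1305_armv7_neon_finish_ext). A minimal C sketch of how a caller might drive it follows; the prototypes, the state size, and the 32-byte block granularity are assumptions made for illustration only (the authoritative declarations are internal to libgcrypt, in poly1305-internal.h), not the library's documented API.

/*
 * Hypothetical driver sketch.  The prototypes below are assumed; only
 * the symbol names are taken from the .globl entries in the diff above.
 * The return value is taken to be a stack-burn hint, matching the
 * `mov r0, #(9*4+32)` epilogues in the assembly.
 */
#include <stddef.h>
#include <string.h>

typedef unsigned char byte;

/* Opaque per-MAC state used by the assembly (key powers, accumulator,
 * pad, flags).  The size here is an assumed, generous upper bound for
 * the sketch, not the real internal state size. */
typedef struct { byte opaque[192]; } poly1305_neon_state_t;

/* Assumed prototypes for the three exported routines. */
unsigned int _gcry_poly1305_armv7_neon_init_ext(void *state, const byte key[32]);
unsigned int _gcry_poly1305_armv7_neon_blocks(void *state, const byte *m, size_t bytes);
unsigned int _gcry_poly1305_armv7_neon_finish_ext(void *state, const byte *m,
                                                  size_t remaining, byte mac[16]);

/* One-shot Poly1305 MAC over msg[0..len) with a 32-byte one-time key. */
static void poly1305_neon_mac(byte mac[16], const byte *msg, size_t len,
                              const byte key[32])
{
  poly1305_neon_state_t st;
  size_t full = len & ~(size_t)31;   /* assumed: blocks() eats 32-byte chunks */

  _gcry_poly1305_armv7_neon_init_ext(&st, key);
  if (full)
    _gcry_poly1305_armv7_neon_blocks(&st, msg, full);
  _gcry_poly1305_armv7_neon_finish_ext(&st, msg + full, len - full, mac);

  memset(&st, 0, sizeof st);         /* wipe key-dependent material */
}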