From 0e752a6e215aee21dc73da097c3225495d54a5b6 Mon Sep 17 00:00:00 2001
From: SoniEx2
Date: Fri, 9 Apr 2021 07:19:03 -0300
Subject: Add libotr/etc sources

---
 .../libgcrypt-1.8.7/cipher/poly1305-sse2-amd64.S | 1043 ++++++++++++++++++++
 1 file changed, 1043 insertions(+)
 create mode 100644 libotr/libgcrypt-1.8.7/cipher/poly1305-sse2-amd64.S

diff --git a/libotr/libgcrypt-1.8.7/cipher/poly1305-sse2-amd64.S b/libotr/libgcrypt-1.8.7/cipher/poly1305-sse2-amd64.S
new file mode 100644
index 0000000..219eb07
--- /dev/null
+++ b/libotr/libgcrypt-1.8.7/cipher/poly1305-sse2-amd64.S
@@ -0,0 +1,1043 @@
+/* poly1305-sse2-amd64.S - AMD64/SSE2 implementation of Poly1305
+ *
+ * Copyright (C) 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Based on public domain implementation by Andrew Moon at
+ *  https://github.com/floodyberry/poly1305-opt
+ */
+
+#include <config.h>
+
+#if defined(__x86_64__) && (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+    defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+
+.text
+
+
+.align 8
+.globl _gcry_poly1305_amd64_sse2_init_ext
+ELF(.type _gcry_poly1305_amd64_sse2_init_ext,@function;)
+_gcry_poly1305_amd64_sse2_init_ext:
+.Lpoly1305_init_ext_x86_local:
+	xor %edx, %edx
+	pushq %r12
+	pushq %r13
+	pushq %r14
+	movq %rdx, %r10
+	movq $-1, %rcx
+	testq %r10, %r10
+	pxor %xmm0, %xmm0
+	movq $0xfffffc0ffff, %r9
+	movdqa %xmm0, (%rdi)
+	cmove %rcx, %r10
+	movdqa %xmm0, 16(%rdi)
+	movq $0xffc0fffffff, %rcx
+	movdqa %xmm0, 32(%rdi)
+	movdqa %xmm0, 48(%rdi)
+	movdqa %xmm0, 64(%rdi)
+	movq 8(%rsi), %r11
+	movq %r11, %r8
+	movq (%rsi), %r12
+	andq %r12, %rcx
+	shrq $44, %r12
+	shlq $20, %r8
+	shrq $24, %r11
+	orq %r8, %r12
+	movq $0xffffffc0f, %r8
+	andq %r9, %r12
+	andq %r8, %r11
+	movl %ecx, %r8d
+	andl $67108863, %r8d
+	movq %rcx, %r9
+	movl %r8d, 84(%rdi)
+	movq %r12, %r8
+	shrq $26, %r9
+	shlq $18, %r8
+	orq %r8, %r9
+	movq %r12, %r8
+	shrq $8, %r8
+	andl $67108863, %r9d
+	andl $67108863, %r8d
+	movl %r9d, 92(%rdi)
+	movq %r12, %r9
+	movl %r8d, 100(%rdi)
+	movq %r11, %r8
+	shrq $34, %r9
+	shlq $10, %r8
+	orq %r8, %r9
+	movq %r11, %r8
+	shrq $16, %r8
+	andl $67108863, %r9d
+	movl %r9d, 108(%rdi)
+	cmpq $16, %r10
+	movl %r8d, 116(%rdi)
+	movl 16(%rsi), %r8d
+	movl %r8d, 124(%rdi)
+	movl 20(%rsi), %r8d
+	movl %r8d, 132(%rdi)
+	movl 24(%rsi), %r8d
+	movl %r8d, 140(%rdi)
+	movl 28(%rsi), %esi
+	movl %esi, 148(%rdi)
+	jbe .Lpoly1305_init_ext_sse2_done
+	lea (%r11,%r11,4), %r14
+	shlq $2, %r14
+	lea (%r12,%r12), %rax
+	mulq %r14
+	movq %rax, %r13
+	movq %rcx, %rax
+	movq %rdx, %r8
+	mulq %rcx
+	addq %rax, %r13
+	lea (%rcx,%rcx), %rax
+	movq %r13, %r9
+	adcq %rdx, %r8
+	mulq %r12
+	shlq $20, %r8
+	movq %rax, %rsi
+	shrq $44, %r9
+	movq %r11, %rax
+	orq %r9, %r8
+	movq %rdx, %r9
+	mulq %r14
+	addq %rax, %rsi
+	movq %rcx, %rax
+	adcq %rdx, %r9
+	addq %r11, %r11
+	mulq %r11
+	addq %rsi, %r8
+	movq %rax, %r11
+	movq %r12, %rax
+	movq %rdx, %rcx
+	adcq $0, %r9
+	mulq %r12
+	addq %rax, %r11
+	movq %r8, %rsi
+	adcq %rdx, %rcx
+	shlq $20, %r9
+	shrq $44, %rsi
+	orq %rsi, %r9
+	movq $0xfffffffffff, %rsi
+	addq %r11, %r9
+	movq %r9, %r12
+	adcq $0, %rcx
+	andq %rsi, %r13
+	shlq $22, %rcx
+	andq %rsi, %r8
+	shrq $42, %r12
+	orq %r12, %rcx
+	movq %rsi, %r12
+	lea (%rcx,%rcx,4), %rcx
+	addq %rcx, %r13
+	movq %rsi, %rcx
+	andq %r13, %rcx
+	shrq $44, %r13
+	movq %rcx, %r14
+	addq %r13, %r8
+	movq $0x3ffffffffff, %r13
+	andq %r8, %r12
+	andq %r13, %r9
+	shrq $44, %r8
+	movq %r12, %r11
+	addq %r8, %r9
+	movq %r12, %rax
+	movq %r9, %r13
+	movl %ecx, %r8d
+	shrq $26, %r14
+	andl $67108863, %r8d
+	shlq $18, %r11
+	shrq $34, %rax
+	orq %r11, %r14
+	shlq $10, %r13
+	movq %r12, %r11
+	orq %r13, %rax
+	movq %r9, %r13
+	shrq $8, %r11
+	shrq $16, %r13
+	andl $67108863, %r14d
+	andl $67108863, %r11d
+	andl $67108863, %eax
+	movl %r8d, 88(%rdi)
+	cmpq $64, %r10
+	movl %r8d, 80(%rdi)
+	movl %r14d, 104(%rdi)
+	movl %r14d, 96(%rdi)
+	movl %r11d, 120(%rdi)
+	movl %r11d, 112(%rdi)
+	movl %eax, 136(%rdi)
+	movl %eax, 128(%rdi)
+	movl %r13d, 152(%rdi)
+	movl %r13d, 144(%rdi)
+	jbe .Lpoly1305_init_ext_sse2_done
+	lea (%r9,%r9,4), %r14
+	shlq $2, %r14
+	lea (%r12,%r12), %rax
+	mulq %r14
+	movq %rax, %r8
+	movq %rcx, %rax
+	movq %rdx, %r10
+	mulq %rcx
+	addq %rax, %r8
+	lea (%rcx,%rcx), %rax
+	movq %r8, %r11
+	adcq %rdx, %r10
+	andq %rsi, %r8
+	mulq %r12
+	shlq $20, %r10
+	movq %rax, %r13
+	shrq $44, %r11
+	movq %r9, %rax
+	orq %r11, %r10
+	movq %rdx, %r11
+	mulq %r14
+	addq %rax, %r13
+	movq %rcx, %rax
+	adcq %rdx, %r11
+	addq %r9, %r9
+	mulq %r9
+	addq %r13, %r10
+	movq %rax, %r9
+	movq %r12, %rax
+	movq %rdx, %rcx
+	adcq $0, %r11
+	mulq %r12
+	addq %rax, %r9
+	movq %r10, %r13
+	adcq %rdx, %rcx
+	andq %rsi, %r10
+	shlq $20, %r11
+	shrq $44, %r13
+	orq %r13, %r11
+	addq %r9, %r11
+	movq %rsi, %r9
+	movq %r11, %r12
+	adcq $0, %rcx
+	shlq $22, %rcx
+	shrq $42, %r12
+	orq %r12, %rcx
+	lea (%rcx,%rcx,4), %rcx
+	addq %rcx, %r8
+	andq %r8, %r9
+	shrq $44, %r8
+	movl %r9d, %eax
+	addq %r8, %r10
+	movq $0x3ffffffffff, %r8
+	andq %r10, %rsi
+	andq %r8, %r11
+	shrq $44, %r10
+	movq %rsi, %r8
+	addq %r10, %r11
+	andl $67108863, %eax
+	shrq $26, %r9
+	movq %r11, %r10
+	shlq $18, %r8
+	shlq $10, %r10
+	orq %r8, %r9
+	movq %rsi, %r8
+	shrq $34, %rsi
+	andl $67108863, %r9d
+	shrq $8, %r8
+	orq %r10, %rsi
+	shrq $16, %r11
+	andl $67108863, %r8d
+	andl $67108863, %esi
+	movl %eax, 168(%rdi)
+	movl %eax, 160(%rdi)
+	movl %r9d, 184(%rdi)
+	movl %r9d, 176(%rdi)
+	movl %r8d, 200(%rdi)
+	movl %r8d, 192(%rdi)
+	movl %esi, 216(%rdi)
+	movl %esi, 208(%rdi)
+	movl %r11d, 232(%rdi)
+	movl %r11d, 224(%rdi)
+.Lpoly1305_init_ext_sse2_done:
+	movq $0, 240(%rdi)
+	popq %r14
+	popq %r13
+	popq %r12
+	ret
+ELF(.size _gcry_poly1305_amd64_sse2_init_ext,.-_gcry_poly1305_amd64_sse2_init_ext;)
+
+
+.align 8
+.globl _gcry_poly1305_amd64_sse2_finish_ext
+ELF(.type _gcry_poly1305_amd64_sse2_finish_ext,@function;)
+_gcry_poly1305_amd64_sse2_finish_ext:
+.Lpoly1305_finish_ext_x86_local:
+	pushq %rbp
+	movq %rsp, %rbp
+	subq $64, %rsp
+	andq $~63, %rsp
+	movq %rdx, 32(%rsp)
+	movq %rcx, 40(%rsp)
+	andq %rdx, %rdx
+	jz .Lpoly1305_finish_x86_no_leftover
+	pxor %xmm0, %xmm0
+	movdqa %xmm0, 0+0(%rsp)
+	movdqa %xmm0, 16+0(%rsp)
+	leaq 0(%rsp), %r8
+	testq $16, %rdx
+	jz .Lpoly1305_finish_x86_skip16
+	movdqu 0(%rsi), %xmm0
+	movdqa %xmm0, 0(%r8)
+	addq $16, %rsi
+	addq $16, %r8
+.Lpoly1305_finish_x86_skip16:
+	testq $8, %rdx
+	jz .Lpoly1305_finish_x86_skip8
+	movq 0(%rsi), %rax
+	movq %rax, 0(%r8)
+	addq $8, %rsi
+	addq $8, %r8
+.Lpoly1305_finish_x86_skip8:
+	testq $4, %rdx
+	jz .Lpoly1305_finish_x86_skip4
+	movl 0(%rsi), %eax
+	movl %eax, 0(%r8)
+	addq $4, %rsi
+	addq $4, %r8
+.Lpoly1305_finish_x86_skip4:
+	testq $2, %rdx
+	jz .Lpoly1305_finish_x86_skip2
+	movw 0(%rsi), %ax
+	movw %ax, 0(%r8)
+	addq $2, %rsi
+	addq $2, %r8
+.Lpoly1305_finish_x86_skip2:
+	testq $1, %rdx
+	jz .Lpoly1305_finish_x86_skip1
+	movb 0(%rsi), %al
+	movb %al, 0(%r8)
+	addq $1, %r8
+.Lpoly1305_finish_x86_skip1:
+	cmpq $16, %rdx
+	je .Lpoly1305_finish_x86_is16
+	movb $1, 0(%r8)
+.Lpoly1305_finish_x86_is16:
+	movq $4, %rax
+	jae .Lpoly1305_finish_x86_16andover
+	movq $8, %rax
+.Lpoly1305_finish_x86_16andover:
+	orq %rax, 240(%rdi)
+	leaq 0(%rsp), %rsi
+	movq $32, %rdx
+	callq .Lpoly1305_blocks_x86_local
+.Lpoly1305_finish_x86_no_leftover:
+	testq $1, 240(%rdi)
+	jz .Lpoly1305_finish_x86_not_started
+	movq 32(%rsp), %rdx
+	andq %rdx, %rdx
+	jz .Lpoly1305_finish_x86_r2r
+	cmpq $16, %rdx
+	jg .Lpoly1305_finish_x86_r2r
+	xorl %r10d, %r10d
+	movl 84(%rdi), %eax
+	movl 92(%rdi), %ecx
+	movl 100(%rdi), %edx
+	movl 108(%rdi), %r8d
+	movl 116(%rdi), %r9d
+	movl %eax, 80(%rdi)
+	movl $1, 8+80(%rdi)
+	movl %ecx, 96(%rdi)
+	movl %r10d, 8+96(%rdi)
+	movl %edx, 112(%rdi)
+	movl %r10d, 8+112(%rdi)
+	movl %r8d, 128(%rdi)
+	movl %r10d, 8+128(%rdi)
+	movl %r9d, 144(%rdi)
+	movl %r10d, 8+144(%rdi)
+	jmp .Lpoly1305_finish_x86_combine
+.Lpoly1305_finish_x86_r2r:
+	movl 84(%rdi), %eax
+	movl 92(%rdi), %ecx
+	movl 100(%rdi), %edx
+	movl 108(%rdi), %r8d
+	movl 116(%rdi), %r9d
+	movl %eax, 8+80(%rdi)
+	movl %ecx, 8+96(%rdi)
+	movl %edx, 8+112(%rdi)
+	movl %r8d, 8+128(%rdi)
+	movl %r9d, 8+144(%rdi)
+.Lpoly1305_finish_x86_combine:
+	xorq %rsi, %rsi
+	movq $32, %rdx
+	callq .Lpoly1305_blocks_x86_local
+.Lpoly1305_finish_x86_not_started:
+	movq 0(%rdi), %r8
+	movq 8(%rdi), %r9
+	movq %r9, %r10
+	movq 16(%rdi), %r11
+	shlq $44, %r9
+	shrq $20, %r10
+	shlq $24, %r11
+	orq %r9, %r8
+	orq %r11, %r10
+	pxor %xmm0, %xmm0
+	movl 124(%rdi), %eax
+	movl 132(%rdi), %ecx
+	movl 140(%rdi), %edx
+	movl 148(%rdi), %esi
+	movq 40(%rsp), %r11
+	shlq $32, %rcx
+	shlq $32, %rsi
+	orq %rcx, %rax
+	orq %rsi, %rdx
+	addq %r8, %rax
+	adcq %r10, %rdx
+	movq %rax, 0(%r11)
+	movq %rdx, 8(%r11)
+	movq %rbp, %rax
+	subq %rsp, %rax
+	movq %rbp, %rsp
+	movdqa %xmm0, 0(%rdi)
+	movdqa %xmm0, 16(%rdi)
+	movdqa %xmm0, 32(%rdi)
+	movdqa %xmm0, 48(%rdi)
+	movdqa %xmm0, 64(%rdi)
+	movdqa %xmm0, 80(%rdi)
+	movdqa %xmm0, 96(%rdi)
+	movdqa %xmm0, 112(%rdi)
+	movdqa %xmm0, 128(%rdi)
+	movdqa %xmm0, 144(%rdi)
+	movdqa %xmm0, 160(%rdi)
+	movdqa %xmm0, 176(%rdi)
+	movdqa %xmm0, 192(%rdi)
+	movdqa %xmm0, 208(%rdi)
+	movdqa %xmm0, 224(%rdi)
+	popq %rbp
+	addq $8, %rax
+	ret
+ELF(.size _gcry_poly1305_amd64_sse2_finish_ext,.-_gcry_poly1305_amd64_sse2_finish_ext;)
+
+
+.align 8
+.globl _gcry_poly1305_amd64_sse2_blocks
+ELF(.type _gcry_poly1305_amd64_sse2_blocks,@function;)
+_gcry_poly1305_amd64_sse2_blocks:
+.Lpoly1305_blocks_x86_local:
+	pushq %rbp
+	movq %rsp, %rbp
+	pushq %rbx
+	andq $-64, %rsp
+	subq $328, %rsp
+	movq 240(%rdi), %rax
+	movl $(1<<24), %r8d
+	movl $((1<<26)-1), %r9d
+	movd %r8, %xmm0
+	movd %r9, %xmm5
+	pshufd $0x44, %xmm0, %xmm0
+	pshufd $0x44, %xmm5, %xmm5
+	testb $4, %al
+	je .Lpoly1305_blocks_x86_3
+	psrldq $8, %xmm0
+.Lpoly1305_blocks_x86_3:
+	testb $8, %al
+	je .Lpoly1305_blocks_x86_4
+	pxor %xmm0, %xmm0
+.Lpoly1305_blocks_x86_4:
+	movdqa %xmm0, 168(%rsp)
+	testb $1, %al
+	jne .Lpoly1305_blocks_x86_5
+	movq 16(%rsi), %xmm0
+	movdqa %xmm5, %xmm7
+	movdqa %xmm5, %xmm10
+	movq (%rsi), %xmm6
+	orq $1, %rax
+	subq $32, %rdx
+	movq 8(%rsi), %xmm1
+	punpcklqdq %xmm0, %xmm6
+	movq 24(%rsi), %xmm0
+	pand %xmm6, %xmm7
+	movdqa %xmm6, %xmm9
+	psrlq $52, %xmm6
+	addq $32, %rsi
+	punpcklqdq %xmm0, %xmm1
+	movdqa %xmm1, %xmm0
+	psrlq $26, %xmm9
+	psllq $12, %xmm0
+	movq %rax, 240(%rdi)
+	pand %xmm5, %xmm9
+	por %xmm0, %xmm6
+	psrlq $40, %xmm1
+	pand %xmm6, %xmm10
+	por 168(%rsp), %xmm1
+	psrlq $26, %xmm6
+	pand %xmm5, %xmm6
+.Lpoly1305_blocks_x86_6:
+	movdqa 80(%rdi), %xmm13
+	cmpq $63, %rdx
+	movl $(5), %r8d
+	movd %r8, %xmm14
+	pshufd $0x44, %xmm14, %xmm14
+	movdqa 96(%rdi), %xmm15
+	movdqa %xmm13, -8(%rsp)
+	movdqa 112(%rdi), %xmm0
+	movdqa %xmm14, 136(%rsp)
+	movdqa 128(%rdi), %xmm3
+	movdqa %xmm15, 312(%rsp)
+	pmuludq %xmm14, %xmm15
+	movdqa 144(%rdi), %xmm13
+	movdqa %xmm0, 232(%rsp)
+	pmuludq %xmm14, %xmm0
+	movdqa %xmm3, 152(%rsp)
+	pmuludq %xmm14, %xmm3
+	movdqa %xmm13, 56(%rsp)
+	pmuludq %xmm14, %xmm13
+	movdqa %xmm15, 40(%rsp)
+	movdqa %xmm0, -24(%rsp)
+	movdqa %xmm3, -40(%rsp)
+	movdqa %xmm13, -56(%rsp)
+	jbe .Lpoly1305_blocks_x86_7
+	movdqa 192(%rdi), %xmm15
+	leaq 32(%rsi), %rax
+	movq %rdx, %rcx
+	movdqa 176(%rdi), %xmm14
+	movdqa %xmm15, %xmm2
+	movdqa 208(%rdi), %xmm0
+	movdqa %xmm15, 216(%rsp)
+	movdqa %xmm14, 296(%rsp)
+	movdqa 224(%rdi), %xmm3
+	pmuludq 136(%rsp), %xmm14
+	movdqa -24(%rsp), %xmm13
+	movdqa %xmm14, 8(%rsp)
+	pmuludq 136(%rsp), %xmm2
+	movdqa -40(%rsp), %xmm14
+	movdqa %xmm0, 120(%rsp)
+	pmuludq 136(%rsp), %xmm0
+	movdqa %xmm3, 24(%rsp)
+	movdqa 160(%rdi), %xmm12
+	movdqa %xmm0, %xmm8
+	movdqa -56(%rsp), %xmm15
+	movdqa %xmm13, 88(%rsp)
+	pmuludq 136(%rsp), %xmm3
+	movdqa %xmm2, 104(%rsp)
+	movdqa %xmm0, %xmm13
+	movdqa -8(%rsp), %xmm11
+	movdqa %xmm3, 280(%rsp)
+	movdqa %xmm2, %xmm3
+	movdqa %xmm0, 200(%rsp)
+	movdqa %xmm14, 184(%rsp)
+	movdqa %xmm15, 264(%rsp)
+	jmp .Lpoly1305_blocks_x86_8
+.p2align 6,,63
+.Lpoly1305_blocks_x86_13:
+	movdqa 200(%rsp), %xmm13
+	movdqa %xmm3, %xmm6
+	movdqa 200(%rsp), %xmm8
+	movdqa 104(%rsp), %xmm3
+.Lpoly1305_blocks_x86_8:
+	movdqa 8(%rsp), %xmm4
+	pmuludq %xmm6, %xmm3
+	subq $64, %rcx
+	pmuludq %xmm10, %xmm8
+	movdqa 104(%rsp), %xmm2
+	movdqa 200(%rsp), %xmm0
+	pmuludq %xmm1, %xmm4
+	movdqa 280(%rsp), %xmm15
+	pmuludq %xmm6, %xmm13
+	movdqa 280(%rsp), %xmm14
+	pmuludq %xmm1, %xmm0
+	paddq %xmm3, %xmm4
+	pmuludq %xmm1, %xmm2
+	movdqa 280(%rsp), %xmm3
+	paddq %xmm8, %xmm4
+	pmuludq %xmm9, %xmm15
+	movdqa 280(%rsp), %xmm8
+	pmuludq %xmm10, %xmm14
+	pmuludq %xmm6, %xmm8
+	paddq %xmm13, %xmm2
+	movdqa %xmm6, %xmm13
+	pmuludq %xmm1, %xmm3
+	paddq %xmm15, %xmm4
+	movdqa 296(%rsp), %xmm15
+	pmuludq %xmm12, %xmm13
+	paddq %xmm14, %xmm2
+	movdqa %xmm7, %xmm14
+	paddq %xmm8, %xmm0
+	pmuludq %xmm12, %xmm14
+	movdqa %xmm9, %xmm8
+	pmuludq 296(%rsp), %xmm6
+	pmuludq %xmm12, %xmm8
+	movdqa %xmm6, 248(%rsp)
+	pmuludq %xmm10, %xmm15
+	movq -16(%rax), %xmm6
+	paddq %xmm13, %xmm3
+	movdqa %xmm10, %xmm13
+	paddq %xmm14, %xmm4
+	movq -8(%rax), %xmm14
+	paddq %xmm8, %xmm2
+	movq -32(%rax), %xmm8
+	pmuludq %xmm12, %xmm13
+	paddq %xmm15, %xmm3
+	pmuludq %xmm12, %xmm1
+	movdqa 216(%rsp), %xmm15
+	pmuludq 216(%rsp), %xmm10
+	punpcklqdq %xmm6, %xmm8
+	movq -24(%rax), %xmm6
+	pmuludq %xmm9, %xmm15
+	paddq %xmm13, %xmm0
+	movdqa 296(%rsp), %xmm13
+	paddq 248(%rsp), %xmm1
+	punpcklqdq %xmm14, %xmm6
+	movdqa 296(%rsp), %xmm14
+	pmuludq %xmm9, %xmm13
+	pmuludq 120(%rsp), %xmm9
+	movdqa %xmm15, 72(%rsp)
+	paddq %xmm10, %xmm1
+	movdqa 216(%rsp), %xmm15
+	pmuludq %xmm7, %xmm14
+	movdqa %xmm6, %xmm10
+	paddq %xmm9, %xmm1
+	pmuludq %xmm7, %xmm15
+	paddq %xmm13, %xmm0
+	paddq 72(%rsp), %xmm3
+	movdqa 120(%rsp), %xmm13
+	psllq $12, %xmm10
+	paddq %xmm14, %xmm2
+	movdqa %xmm5, %xmm14
+	pand %xmm8, %xmm14
+	pmuludq %xmm7, %xmm13
+	paddq %xmm15, %xmm0
+	movdqa %xmm14, 248(%rsp)
+	movdqa %xmm8, %xmm14
+	psrlq $52, %xmm8
+	movdqu (%rax), %xmm9
+	por %xmm10, %xmm8
+	pmuludq 24(%rsp), %xmm7
+	movdqu 16(%rax), %xmm10
+	paddq %xmm13, %xmm3
+	pxor %xmm13, %xmm13
+	movdqa %xmm9, %xmm15
+	paddq %xmm7, %xmm1
+	movdqa %xmm6, %xmm7
+	movdqa %xmm10, -72(%rsp)
+	punpckldq %xmm10, %xmm15
+	movdqa %xmm15, %xmm10
+	punpckldq %xmm13, %xmm10
+	punpckhdq -72(%rsp), %xmm9
+	psrlq $40, %xmm6
+	movdqa %xmm10, 72(%rsp)
+	movdqa %xmm9, %xmm10
+	punpckhdq %xmm13, %xmm9
+	psllq $18, %xmm9
+	paddq 72(%rsp), %xmm4
+	addq $64, %rax
+	paddq %xmm9, %xmm3
+	movdqa 40(%rsp), %xmm9
+	cmpq $63, %rcx
+	punpckhdq %xmm13, %xmm15
+	psllq $6, %xmm15
+	punpckldq %xmm13, %xmm10
+	paddq %xmm15, %xmm2
+	psllq $12, %xmm10
+	por 168(%rsp), %xmm6
+	pmuludq %xmm6, %xmm9
+	movdqa 88(%rsp), %xmm15
+	paddq %xmm10, %xmm0
+	movdqa 88(%rsp), %xmm13
+	psrlq $14, %xmm7
+	pand %xmm5, %xmm8
+	movdqa 184(%rsp), %xmm10
+	pand %xmm5, %xmm7
+	pmuludq %xmm7, %xmm15
+	paddq %xmm9, %xmm4
+	pmuludq %xmm6, %xmm13
+	movdqa 184(%rsp), %xmm9
+	paddq 168(%rsp), %xmm1
+	pmuludq %xmm7, %xmm10
+	pmuludq %xmm6, %xmm9
+	paddq %xmm15, %xmm4
+	movdqa 184(%rsp), %xmm15
+	paddq %xmm13, %xmm2
+	psrlq $26, %xmm14
+	movdqa 264(%rsp), %xmm13
+	paddq %xmm10, %xmm2
+	pmuludq %xmm8, %xmm15
+	pand %xmm5, %xmm14
+	paddq %xmm9, %xmm0
+	pmuludq %xmm6, %xmm13
+	movdqa 264(%rsp), %xmm9
+	movdqa 264(%rsp), %xmm10
+	pmuludq %xmm11, %xmm6
+	pmuludq %xmm8, %xmm9
+	paddq %xmm15, %xmm4
+	movdqa 264(%rsp), %xmm15
+	pmuludq %xmm14, %xmm10
+	paddq %xmm13, %xmm3
+	movdqa %xmm7, %xmm13
+	pmuludq %xmm7, %xmm15
+	paddq %xmm6, %xmm1
+	movdqa 312(%rsp), %xmm6
+	paddq %xmm9, %xmm2
+	pmuludq %xmm11, %xmm13
+	movdqa 248(%rsp), %xmm9
+	paddq %xmm10, %xmm4
+	pmuludq %xmm8, %xmm6
+	pmuludq 312(%rsp), %xmm7
+	paddq %xmm15, %xmm0
+	movdqa %xmm9, %xmm10
+	movdqa %xmm14, %xmm15
+	pmuludq %xmm11, %xmm10
+	paddq %xmm13, %xmm3
+	movdqa %xmm8, %xmm13
+	pmuludq %xmm11, %xmm13
+	paddq %xmm6, %xmm3
+	paddq %xmm7, %xmm1
+	movdqa 232(%rsp), %xmm6
+	pmuludq %xmm11, %xmm15
+	pmuludq 232(%rsp), %xmm8
+	paddq %xmm10, %xmm4
+	paddq %xmm8, %xmm1
+	movdqa 312(%rsp), %xmm10
+	paddq %xmm13, %xmm0
+	pmuludq %xmm14, %xmm6
+	movdqa 312(%rsp), %xmm13
+	pmuludq %xmm9, %xmm10
+	paddq %xmm15, %xmm2
+	movdqa 232(%rsp), %xmm7
+	pmuludq %xmm14, %xmm13
+	pmuludq 152(%rsp), %xmm14
+	paddq %xmm14, %xmm1
+	pmuludq %xmm9, %xmm7
+	paddq %xmm6, %xmm3
+	paddq %xmm10, %xmm2
+	movdqa 152(%rsp), %xmm10
+	paddq %xmm13, %xmm0
+	pmuludq %xmm9, %xmm10
+	paddq %xmm7, %xmm0
+	movdqa %xmm4, %xmm7
+	psrlq $26, %xmm7
+	pmuludq 56(%rsp), %xmm9
+	pand %xmm5, %xmm4
+	paddq %xmm7, %xmm2
+	paddq %xmm9, %xmm1
+	paddq %xmm10, %xmm3
+	movdqa %xmm2, %xmm7
+	movdqa %xmm2, %xmm9
+	movdqa %xmm3, %xmm6
+	psrlq $26, %xmm7
+	pand %xmm5, %xmm3
+	psrlq $26, %xmm6
+	paddq %xmm7, %xmm0
+	pand %xmm5, %xmm9
+	paddq %xmm6, %xmm1
+	movdqa %xmm0, %xmm10
+	movdqa %xmm1, %xmm6
+	pand %xmm5, %xmm10
+	pand %xmm5, %xmm1
+	psrlq $26, %xmm6
+	pmuludq 136(%rsp), %xmm6
+	paddq %xmm6, %xmm4
+	movdqa %xmm0, %xmm6
+	psrlq $26, %xmm6
+	movdqa %xmm4, %xmm2
+	movdqa %xmm4, %xmm7
+	paddq %xmm6, %xmm3
+	psrlq $26, %xmm2
+	pand %xmm5, %xmm7
+	movdqa %xmm3, %xmm0
+	paddq %xmm2, %xmm9
+	pand %xmm5, %xmm3
+	psrlq $26, %xmm0
+	paddq %xmm0, %xmm1
+	ja .Lpoly1305_blocks_x86_13
+	leaq -64(%rdx), %rax
+	movdqa %xmm3, %xmm6
+	andl $63, %edx
+	andq $-64, %rax
+	leaq 64(%rsi,%rax), %rsi
+.Lpoly1305_blocks_x86_7:
+	cmpq $31, %rdx
+	jbe .Lpoly1305_blocks_x86_9
+	movdqa -24(%rsp), %xmm13
+	movdqa %xmm6, %xmm0
+	movdqa %xmm6, %xmm3
+	movdqa 40(%rsp), %xmm11
+	movdqa %xmm1, %xmm12
+	testq %rsi, %rsi
+	movdqa -40(%rsp), %xmm2
+	pmuludq %xmm13, %xmm0
+	movdqa %xmm1, %xmm8
+	pmuludq %xmm1, %xmm11
+	movdqa %xmm10, %xmm4
+	movdqa %xmm1, %xmm14
+	pmuludq %xmm2, %xmm3
+	movdqa %xmm6, %xmm15
+	pmuludq %xmm1, %xmm13
+	movdqa %xmm7, %xmm1
+	pmuludq %xmm2, %xmm12
+	paddq %xmm0, %xmm11
+	movdqa -56(%rsp), %xmm0
+	pmuludq %xmm10, %xmm2
+	paddq %xmm3, %xmm13
+	pmuludq %xmm0, %xmm4
+	movdqa %xmm9, %xmm3
+	pmuludq %xmm0, %xmm3
+	paddq %xmm2, %xmm11
+	pmuludq %xmm0, %xmm8
+	movdqa %xmm6, %xmm2
+	pmuludq %xmm0, %xmm2
+	movdqa -8(%rsp), %xmm0
+	paddq %xmm4, %xmm13
+	movdqa 312(%rsp), %xmm4
+	paddq %xmm3, %xmm11
+	pmuludq 312(%rsp), %xmm6
+	movdqa 312(%rsp), %xmm3
+	pmuludq %xmm0, %xmm1
+	paddq %xmm2, %xmm12
+	pmuludq %xmm0, %xmm15
+	movdqa %xmm9, %xmm2
+	pmuludq %xmm0, %xmm2
+	pmuludq %xmm7, %xmm3
+	paddq %xmm1, %xmm11
+	movdqa 232(%rsp), %xmm1
+	pmuludq %xmm0, %xmm14
+	paddq %xmm15, %xmm8
+	pmuludq %xmm10, %xmm0
+	paddq %xmm2, %xmm13
+	movdqa 312(%rsp), %xmm2
+	pmuludq %xmm10, %xmm4
+	paddq %xmm3, %xmm13
+	movdqa 152(%rsp), %xmm3
+	pmuludq %xmm9, %xmm2
+	paddq %xmm6, %xmm14
+	pmuludq 232(%rsp), %xmm10
+	paddq %xmm0, %xmm12
+	pmuludq %xmm9, %xmm1
+	paddq %xmm10, %xmm14
+	movdqa 232(%rsp), %xmm0
+	pmuludq %xmm7, %xmm3
+	paddq %xmm4, %xmm8
+	pmuludq 152(%rsp), %xmm9
+	paddq %xmm2, %xmm12
+	paddq %xmm9, %xmm14
+	pmuludq %xmm7, %xmm0
+	paddq %xmm1, %xmm8
+	pmuludq 56(%rsp), %xmm7
+	paddq %xmm3, %xmm8
+	paddq %xmm7, %xmm14
+	paddq %xmm0, %xmm12
+	je .Lpoly1305_blocks_x86_10
+	movdqu (%rsi), %xmm1
+	pxor %xmm0, %xmm0
+	paddq 168(%rsp), %xmm14
+	movdqu 16(%rsi), %xmm2
+	movdqa %xmm1, %xmm3
+	punpckldq %xmm2, %xmm3
+	punpckhdq %xmm2, %xmm1
+	movdqa %xmm3, %xmm4
+	movdqa %xmm1, %xmm2
+	punpckldq %xmm0, %xmm4
+	punpckhdq %xmm0, %xmm3
+	punpckhdq %xmm0, %xmm1
+	punpckldq %xmm0, %xmm2
+	movdqa %xmm2, %xmm0
+	psllq $6, %xmm3
+	paddq %xmm4, %xmm11
+	psllq $12, %xmm0
+	paddq %xmm3, %xmm13
+	psllq $18, %xmm1
+	paddq %xmm0, %xmm12
+	paddq %xmm1, %xmm8
+.Lpoly1305_blocks_x86_10:
+	movdqa %xmm11, %xmm9
+	movdqa %xmm8, %xmm1
+	movdqa %xmm11, %xmm7
+	psrlq $26, %xmm9
+	movdqa %xmm8, %xmm6
+	pand %xmm5, %xmm7
+	paddq %xmm13, %xmm9
+	psrlq $26, %xmm1
+	pand %xmm5, %xmm6
+	movdqa %xmm9, %xmm10
+	paddq %xmm14, %xmm1
+	pand %xmm5, %xmm9
+	psrlq $26, %xmm10
+	movdqa %xmm1, %xmm0
+	pand %xmm5, %xmm1
+	paddq %xmm12, %xmm10
+	psrlq $26, %xmm0
+	pmuludq 136(%rsp), %xmm0
+	movdqa %xmm10, %xmm2
+	paddq %xmm0, %xmm7
+	psrlq $26, %xmm2
+	movdqa %xmm7, %xmm0
+	pand %xmm5, %xmm10
+	paddq %xmm2, %xmm6
+	psrlq $26, %xmm0
+	pand %xmm5, %xmm7
+	movdqa %xmm6, %xmm2
+	paddq %xmm0, %xmm9
+	pand %xmm5, %xmm6
+	psrlq $26, %xmm2
+	paddq %xmm2, %xmm1
+.Lpoly1305_blocks_x86_9:
+	testq %rsi, %rsi
+	je .Lpoly1305_blocks_x86_11
+	movdqa %xmm7, 0(%rdi)
+	movdqa %xmm9, 16(%rdi)
+	movdqa %xmm10, 32(%rdi)
+	movdqa %xmm6, 48(%rdi)
+	movdqa %xmm1, 64(%rdi)
+	movq -8(%rbp), %rbx
+	leave
+	ret
+.Lpoly1305_blocks_x86_5:
+	movdqa 0(%rdi), %xmm7
+	movdqa 16(%rdi), %xmm9
+	movdqa 32(%rdi), %xmm10
+	movdqa 48(%rdi), %xmm6
+	movdqa 64(%rdi), %xmm1
+	jmp .Lpoly1305_blocks_x86_6
+.Lpoly1305_blocks_x86_11:
+	movdqa %xmm7, %xmm0
+	movdqa %xmm9, %xmm2
+	movdqa %xmm6, %xmm3
+	psrldq $8, %xmm0
+	movabsq $4398046511103, %rbx
+	paddq %xmm0, %xmm7
+	psrldq $8, %xmm2
+	movdqa %xmm10, %xmm0
+	movd %xmm7, %edx
+	paddq %xmm2, %xmm9
+	psrldq $8, %xmm0
+	movl %edx, %ecx
+	movd %xmm9, %eax
+	paddq %xmm0, %xmm10
+	shrl $26, %ecx
+	psrldq $8, %xmm3
+	movdqa %xmm1, %xmm0
+	addl %ecx, %eax
+	movd %xmm10, %ecx
+	paddq %xmm3, %xmm6
+	movl %eax, %r9d
+	shrl $26, %eax
+	psrldq $8, %xmm0
+	addl %ecx, %eax
+	movd %xmm6, %ecx
+	paddq %xmm0, %xmm1
+	movl %eax, %esi
+	andl $67108863, %r9d
+	movd %xmm1, %r10d
+	shrl $26, %esi
+	andl $67108863, %eax
+	andl $67108863, %edx
+	addl %ecx, %esi
+	salq $8, %rax
+	movl %r9d, %ecx
+	shrl $18, %r9d
+	movl %esi, %r8d
+	shrl $26, %esi
+	andl $67108863, %r8d
+	addl %r10d, %esi
+	orq %r9, %rax
+	salq $16, %rsi
+	movq %r8, %r9
+	shrl $10, %r8d
+	salq $26, %rcx
+	orq %r8, %rsi
+	salq $34, %r9
+	orq %rdx, %rcx
+	movq %rsi, %r8
+	shrq $42, %rsi
+	movabsq $17592186044415, %rdx
+	orq %r9, %rax
+	andq %rbx, %r8
+	leaq (%rsi,%rsi,4), %rsi
+	andq %rdx, %rcx
+	andq %rdx, %rax
+	movabsq $-4398046511104, %r10
+	addq %rsi, %rcx
+	movq %rcx, %rsi
+	shrq $44, %rcx
+	addq %rcx, %rax
+	andq %rdx, %rsi
+	movq %rax, %rcx
+	shrq $44, %rax
+	addq %r8, %rax
+	andq %rdx, %rcx
+	andq %rax, %rbx
+	shrq $42, %rax
+	leaq (%rsi,%rax,4), %rsi
+	addq %rbx, %r10
+	addq %rax, %rsi
+	movq %rsi, %r8
+	shrq $44, %rsi
+	andq %rdx, %r8
+	addq %rcx, %rsi
+	leaq 5(%r8), %r9
+	movq %r9, %r11
+	andq %rdx, %r9
+	shrq $44, %r11
+	addq %rsi, %r11
+	movq %r11, %rax
+	andq %r11, %rdx
+	shrq $44, %rax
+	addq %rax, %r10
+	movq %r10, %rax
+	shrq $63, %rax
+	subq $1, %rax
+	movq %rax, %rcx
+	andq %rax, %r9
+	andq %rax, %rdx
+	notq %rcx
+	andq %r10, %rax
+	andq %rcx, %r8
+	andq %rcx, %rsi
+	andq %rbx, %rcx
+	orq %r9, %r8
+	orq %rdx, %rsi
+	orq %rax, %rcx
+	movq %r8, 0(%rdi)
+	movq %rsi, 8(%rdi)
+	movq %rcx, 16(%rdi)
+	movq -8(%rbp), %rbx
+	movq %rbp, %rax
+	subq %rsp, %rax
+	pxor %xmm15, %xmm15
+	pxor %xmm7, %xmm7
+	pxor %xmm14, %xmm14
+	pxor %xmm6, %xmm6
+	pxor %xmm13, %xmm13
+	pxor %xmm5, %xmm5
+	pxor %xmm12, %xmm12
+	pxor %xmm4, %xmm4
+	leave
+	addq $8, %rax
+	pxor %xmm11, %xmm11
+	pxor %xmm3, %xmm3
+	pxor %xmm10, %xmm10
+	pxor %xmm2, %xmm2
+	pxor %xmm9, %xmm9
+	pxor %xmm1, %xmm1
+	pxor %xmm8, %xmm8
+	pxor %xmm0, %xmm0
+	ret
+ELF(.size _gcry_poly1305_amd64_sse2_blocks,.-_gcry_poly1305_amd64_sse2_blocks;)
+
+#endif
--
cgit 1.4.1