Diffstat (limited to 'libotr/libgcrypt-1.8.7/cipher/cast5-amd64.S')
-rw-r--r-- | libotr/libgcrypt-1.8.7/cipher/cast5-amd64.S | 605 |
1 file changed, 605 insertions, 0 deletions
diff --git a/libotr/libgcrypt-1.8.7/cipher/cast5-amd64.S b/libotr/libgcrypt-1.8.7/cipher/cast5-amd64.S
new file mode 100644
index 0000000..c04015a
--- /dev/null
+++ b/libotr/libgcrypt-1.8.7/cipher/cast5-amd64.S
@@ -0,0 +1,605 @@
+/* cast5-amd64.S - AMD64 assembly implementation of CAST5 cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_CAST5)
+
+#if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__)
+# define GET_EXTERN_POINTER(name, reg) movabsq $name, reg
+#else
+# ifdef __code_model_large__
+#  define GET_EXTERN_POINTER(name, reg) \
+	pushq %r15; \
+	pushq %r14; \
+	1: leaq 1b(%rip), reg; \
+	movabsq $_GLOBAL_OFFSET_TABLE_-1b, %r14; \
+	movabsq $name@GOT, %r15; \
+	addq %r14, reg; \
+	popq %r14; \
+	movq (reg, %r15), reg; \
+	popq %r15;
+# else
+#  define GET_EXTERN_POINTER(name, reg) movq name@GOTPCREL(%rip), reg
+# endif
+#endif
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+.text
+
+.extern _gcry_cast5_s1to4;
+
+#define s1 0
+#define s2 (s1 + (4 * 256))
+#define s3 (s2 + (4 * 256))
+#define s4 (s3 + (4 * 256))
+
+/* structure of CAST5_context: */
+#define Km 0
+#define Kr (Km + (16 * 4))
+
+/* register macros */
+#define CTX %rdi
+#define RIO %rsi
+#define RTAB %r8
+
+#define RLR0 %r9
+#define RLR1 %r10
+#define RLR2 %r11
+#define RLR3 %r12
+
+#define RLR0d %r9d
+#define RLR1d %r10d
+#define RLR2d %r11d
+#define RLR3d %r12d
+
+#define RX0 %rax
+#define RX1 %rbx
+#define RX2 %rdx
+
+#define RX0d %eax
+#define RX1d %ebx
+#define RX2d %edx
+
+#define RX0bl %al
+#define RX1bl %bl
+#define RX2bl %dl
+
+#define RX0bh %ah
+#define RX1bh %bh
+#define RX2bh %dh
+
+#define RKR %rcx
+#define RKRd %ecx
+#define RKRbl %cl
+
+#define RT0 %rbp
+#define RT1 %rsi
+
+#define RT0d %ebp
+#define RT1d %esi
+
+#define RKM0d %r13d
+#define RKM1d %r14d
+
+/***********************************************************************
+ * 1-way cast5
+ ***********************************************************************/
+#define dummy(x)
+
+#define shr_kr(none) \
+	shrq $8, RKR;
+
+#define F(km, load_next_kr, op0, op1, op2, op3) \
+	op0 ## l RLR0d, km ## d; \
+	roll RKRbl, km ## d; \
+	rorq $32, RLR0; \
+	movzbl km ## bh, RT0d; \
+	movzbl km ## bl, RT1d; \
+	roll $16, km ## d; \
+	movl s1(RTAB,RT0,4), RT0d; \
+	op1 ## l s2(RTAB,RT1,4), RT0d; \
+	load_next_kr(kr_next); \
+	movzbl km ## bh, RT1d; \
+	movzbl km ## bl, km ## d; \
+	op2 ## l s3(RTAB,RT1,4), RT0d; \
+	op3 ## l s4(RTAB,km,4), RT0d; \
+	xorq RT0, RLR0;
+
+#define F1(km, load_next_kr) \
+	F(##km, load_next_kr, add, xor, sub, add)
+#define F2(km, load_next_kr) \
+	F(##km, load_next_kr, xor, sub, add, xor)
+#define F3(km, load_next_kr) \
+	F(##km, load_next_kr, sub, add, xor, sub)
+
+#define get_round_km(n, km) \
+	movl Km+4*(n)(CTX), km;
+
+#define get_round_kr_enc(n) \
+	movq $0x1010101010101010, RKR; \
+	\
+	/* merge rorl rk and rorl $16 */ \
+	xorq Kr+(n)(CTX), RKR;
+
+#define get_round_kr_dec(n) \
+	movq $0x1010101010101010, RKR; \
+	\
+	/* merge rorl rk and rorl $16 */ \
+	xorq Kr+(n - 7)(CTX), RKR; \
+	bswapq RKR;
+
+#define round_enc(n, FA, FB, fn1, fn2) \
+	get_round_km(n + 1, RX2d); \
+	FA(RX0, fn1); \
+	get_round_km(n + 2, RX0d); \
+	FB(RX2, fn2);
+
+#define round_enc_last(n, FXA, FXB) \
+	get_round_km(n + 1, RX2d); \
+	\
+	FXA(RX0, shr_kr); \
+	FXB(RX2, dummy);
+
+#define round_enc_1(n, FA, FB) \
+	round_enc(n, FA, FB, shr_kr, shr_kr)
+
+#define round_enc_2(n, FA, FB) \
+	round_enc(n, FA, FB, shr_kr, dummy)
+
+#define round_dec(n, FA, FB, fn1, fn2) \
+	get_round_km(n - 1, RX2d); \
+	FA(RX0, fn1); \
+	get_round_km(n - 2, RX0d); \
+	FB(RX2, fn2);
+
+#define round_dec_last(n, FXA, FXB) \
+	get_round_km(n - 1, RX2d); \
+	FXA(RX0, shr_kr); \
+	FXB(RX2, dummy);
+
+#define round_dec_1(n, FA, FB) \
+	round_dec(n, FA, FB, shr_kr, shr_kr)
+
+#define round_dec_2(n, FA, FB) \
+	round_dec(n, FA, FB, shr_kr, dummy)
+
+#define read_block() \
+	movq (RIO), RLR0; \
+	bswapq RLR0;
+
+#define write_block() \
+	bswapq RLR0; \
+	rorq $32, RLR0; \
+	movq RLR0, (RIO);
+
+.align 8
+.globl _gcry_cast5_amd64_encrypt_block
+ELF(.type _gcry_cast5_amd64_encrypt_block,@function;)
+
+_gcry_cast5_amd64_encrypt_block:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+	pushq %rbp;
+	pushq %rbx;
+
+	movq %rsi, %r10;
+
+	GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
+
+	movq %rdx, RIO;
+	read_block();
+
+	get_round_km(0, RX0d);
+	get_round_kr_enc(0);
+	round_enc_1(0, F1, F2);
+	round_enc_1(2, F3, F1);
+	round_enc_1(4, F2, F3);
+	round_enc_2(6, F1, F2);
+	get_round_kr_enc(8);
+	round_enc_1(8, F3, F1);
+	round_enc_1(10, F2, F3);
+	round_enc_1(12, F1, F2);
+	round_enc_last(14, F3, F1);
+
+	movq %r10, RIO;
+	write_block();
+
+	popq %rbx;
+	popq %rbp;
+	ret;
+ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;)
+
+.align 8
+.globl _gcry_cast5_amd64_decrypt_block
+ELF(.type _gcry_cast5_amd64_decrypt_block,@function;)
+
+_gcry_cast5_amd64_decrypt_block:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 */
+	pushq %rbp;
+	pushq %rbx;
+
+	movq %rsi, %r10;
+
+	GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
+
+	movq %rdx, RIO;
+	read_block();
+
+	get_round_km(15, RX0d);
+	get_round_kr_dec(15);
+	round_dec_1(15, F1, F3);
+	round_dec_1(13, F2, F1);
+	round_dec_1(11, F3, F2);
+	round_dec_2(9, F1, F3);
+	get_round_kr_dec(7);
+	round_dec_1(7, F2, F1);
+	round_dec_1(5, F3, F2);
+	round_dec_1(3, F1, F3);
+	round_dec_last(1, F2, F1);
+
+	movq %r10, RIO;
+	write_block();
+
+	popq %rbx;
+	popq %rbp;
+	ret;
+ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;)
+
+/**********************************************************************
+  4-way cast5, four blocks parallel
+ **********************************************************************/
+#define F_tail(rlr, rx, op1, op2, op3) \
+	movzbl rx ## bh, RT0d; \
+	movzbl rx ## bl, RT1d; \
+	roll $16, rx ## d; \
+	movl s1(RTAB,RT0,4), RT0d; \
+	op1 ## l s2(RTAB,RT1,4), RT0d; \
+	movzbl rx ## bh, RT1d; \
+	movzbl rx ## bl, rx ## d; \
+	op2 ## l s3(RTAB,RT1,4), RT0d; \
+	op3 ## l s4(RTAB,rx,4), RT0d; \
+	xorq RT0, rlr;
+
+#define F4(km, load_next_kr, op0, op1, op2, op3) \
+	movl km, RX0d; \
+	op0 ## l RLR0d, RX0d; \
+	roll RKRbl, RX0d; \
+	rorq $32, RLR0; \
+	\
+	movl km, RX1d; \
+	op0 ## l RLR1d, RX1d; \
+	roll RKRbl, RX1d; \
+	rorq $32, RLR1; \
+	\
+	movl km, RX2d; \
+	op0 ## l RLR2d, RX2d; \
+	roll RKRbl, RX2d; \
+	rorq $32, RLR2; \
+	\
+	F_tail(RLR0, RX0, op1, op2, op3); \
+	F_tail(RLR1, RX1, op1, op2, op3); \
+	F_tail(RLR2, RX2, op1, op2, op3); \
+	\
+	movl km, RX0d; \
+	op0 ## l RLR3d, RX0d; \
+	roll RKRbl, RX0d; \
+	load_next_kr(); \
+	rorq $32, RLR3; \
+	\
+	F_tail(RLR3, RX0, op1, op2, op3);
+
+#define F4_1(km, load_next_kr) \
+	F4(km, load_next_kr, add, xor, sub, add)
+#define F4_2(km, load_next_kr) \
+	F4(km, load_next_kr, xor, sub, add, xor)
+#define F4_3(km, load_next_kr) \
+	F4(km, load_next_kr, sub, add, xor, sub)
+
+#define round_enc4(n, FA, FB, fn1, fn2) \
+	get_round_km(n + 1, RKM1d); \
+	FA(RKM0d, fn1); \
+	get_round_km(n + 2, RKM0d); \
+	FB(RKM1d, fn2);
+
+#define round_enc_last4(n, FXA, FXB) \
+	get_round_km(n + 1, RKM1d); \
+	FXA(RKM0d, shr_kr); \
+	FXB(RKM1d, dummy);
+
+#define round_enc4_1(n, FA, FB) \
+	round_enc4(n, FA, FB, shr_kr, shr_kr);
+
+#define round_enc4_2(n, FA, FB) \
+	round_enc4(n, FA, FB, shr_kr, dummy);
+
+#define round_dec4(n, FA, FB, fn1, fn2) \
+	get_round_km(n - 1, RKM1d); \
+	FA(RKM0d, fn1); \
+	get_round_km(n - 2, RKM0d); \
+	FB(RKM1d, fn2);
+
+#define round_dec_last4(n, FXA, FXB) \
+	get_round_km(n - 1, RKM1d); \
+	FXA(RKM0d, shr_kr); \
+	FXB(RKM1d, dummy);
+
+#define round_dec4_1(n, FA, FB) \
+	round_dec4(n, FA, FB, shr_kr, shr_kr);
+
+#define round_dec4_2(n, FA, FB) \
+	round_dec4(n, FA, FB, shr_kr, dummy);
+
+#define inbswap_block4(a, b, c, d) \
+	bswapq a; \
+	bswapq b; \
+	bswapq c; \
+	bswapq d;
+
+#define outbswap_block4(a, b, c, d) \
+	bswapq a; \
+	bswapq b; \
+	bswapq c; \
+	bswapq d; \
+	rorq $32, a; \
+	rorq $32, b; \
+	rorq $32, c; \
+	rorq $32, d;
+
+.align 8
+ELF(.type __cast5_enc_blk4,@function;)
+
+__cast5_enc_blk4:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RLR0,RLR1,RLR2,RLR3: four input plaintext blocks
+	 * output:
+	 *	RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks
+	 */
+	GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
+
+	get_round_km(0, RKM0d);
+	get_round_kr_enc(0);
+	round_enc4_1(0, F4_1, F4_2);
+	round_enc4_1(2, F4_3, F4_1);
+	round_enc4_1(4, F4_2, F4_3);
+	round_enc4_2(6, F4_1, F4_2);
+	get_round_kr_enc(8);
+	round_enc4_1(8, F4_3, F4_1);
+	round_enc4_1(10, F4_2, F4_3);
+	round_enc4_1(12, F4_1, F4_2);
+	round_enc_last4(14, F4_3, F4_1);
+
+	outbswap_block4(RLR0, RLR1, RLR2, RLR3);
+	ret;
+ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;)
+
+.align 8
+ELF(.type __cast5_dec_blk4,@function;)
+
+__cast5_dec_blk4:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RLR0,RLR1,RLR2,RLR3: four input ciphertext blocks
+	 * output:
+	 *	RLR0,RLR1,RLR2,RLR3: four output plaintext blocks
+	 */
+	GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB);
+
+	inbswap_block4(RLR0, RLR1, RLR2, RLR3);
+
+	get_round_km(15, RKM0d);
+	get_round_kr_dec(15);
+	round_dec4_1(15, F4_1, F4_3);
+	round_dec4_1(13, F4_2, F4_1);
+	round_dec4_1(11, F4_3, F4_2);
+	round_dec4_2(9, F4_1, F4_3);
+	get_round_kr_dec(7);
+	round_dec4_1(7, F4_2, F4_1);
+	round_dec4_1(5, F4_3, F4_2);
+	round_dec4_1(3, F4_1, F4_3);
+	round_dec_last4(1, F4_2, F4_1);
+
+	outbswap_block4(RLR0, RLR1, RLR2, RLR3);
+	ret;
+ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;)
+
+.align 8
+.globl _gcry_cast5_amd64_ctr_enc
+ELF(.type _gcry_cast5_amd64_ctr_enc,@function;)
+_gcry_cast5_amd64_ctr_enc:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (4 blocks)
+	 *	%rdx: src (4 blocks)
+	 *	%rcx: iv (big endian, 64bit)
+	 */
+
+	pushq %rbp;
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+	pushq %r14;
+
+	pushq %rsi;
+	pushq %rdx;
+
+	/* load IV and byteswap */
+	movq (%rcx), RX0;
+	bswapq RX0;
+	movq RX0, RLR0;
+
+	/* construct IVs */
+	leaq 1(RX0), RLR1;
+	leaq 2(RX0), RLR2;
+	leaq 3(RX0), RLR3;
+	leaq 4(RX0), RX0;
+	bswapq RX0;
+
+	/* store new IV */
+	movq RX0, (%rcx);
+
+	call __cast5_enc_blk4;
+
+	popq %r14; /*src*/
+	popq %r13; /*dst*/
+
+	/* XOR key-stream with plaintext */
+	xorq 0 * 8(%r14), RLR0;
+	xorq 1 * 8(%r14), RLR1;
+	xorq 2 * 8(%r14), RLR2;
+	xorq 3 * 8(%r14), RLR3;
+	movq RLR0, 0 * 8(%r13);
+	movq RLR1, 1 * 8(%r13);
+	movq RLR2, 2 * 8(%r13);
+	movq RLR3, 3 * 8(%r13);
+
+	popq %r14;
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+	ret
+ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;)
+
+.align 8
+.globl _gcry_cast5_amd64_cbc_dec
+ELF(.type _gcry_cast5_amd64_cbc_dec,@function;)
+_gcry_cast5_amd64_cbc_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (4 blocks)
+	 *	%rdx: src (4 blocks)
+	 *	%rcx: iv (64bit)
+	 */
+
+	pushq %rbp;
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+	pushq %r14;
+
+	pushq %rcx;
+	pushq %rsi;
+	pushq %rdx;
+
+	/* load input */
+	movq 0 * 8(%rdx), RLR0;
+	movq 1 * 8(%rdx), RLR1;
+	movq 2 * 8(%rdx), RLR2;
+	movq 3 * 8(%rdx), RLR3;
+
+	call __cast5_dec_blk4;
+
+	popq RX0; /*src*/
+	popq RX1; /*dst*/
+	popq RX2; /*iv*/
+
+	movq 3 * 8(RX0), %r14;
+	xorq (RX2), RLR0;
+	xorq 0 * 8(RX0), RLR1;
+	xorq 1 * 8(RX0), RLR2;
+	xorq 2 * 8(RX0), RLR3;
+	movq %r14, (RX2); /* store new IV */
+
+	movq RLR0, 0 * 8(RX1);
+	movq RLR1, 1 * 8(RX1);
+	movq RLR2, 2 * 8(RX1);
+	movq RLR3, 3 * 8(RX1);
+
+	popq %r14;
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+	ret;
+
+ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;)
+
+.align 8
+.globl _gcry_cast5_amd64_cfb_dec
+ELF(.type _gcry_cast5_amd64_cfb_dec,@function;)
+_gcry_cast5_amd64_cfb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (4 blocks)
+	 *	%rdx: src (4 blocks)
+	 *	%rcx: iv (64bit)
+	 */
+
+	pushq %rbp;
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+	pushq %r14;
+
+	pushq %rsi;
+	pushq %rdx;
+
+	/* Load input */
+	movq (%rcx), RLR0;
+	movq 0 * 8(%rdx), RLR1;
+	movq 1 * 8(%rdx), RLR2;
+	movq 2 * 8(%rdx), RLR3;
+
+	inbswap_block4(RLR0, RLR1, RLR2, RLR3);
+
+	/* Update IV */
+	movq 3 * 8(%rdx), %rdx;
+	movq %rdx, (%rcx);
+
+	call __cast5_enc_blk4;
+
+	popq %rdx; /*src*/
+	popq %rcx; /*dst*/
+
+	xorq 0 * 8(%rdx), RLR0;
+	xorq 1 * 8(%rdx), RLR1;
+	xorq 2 * 8(%rdx), RLR2;
+	xorq 3 * 8(%rdx), RLR3;
+	movq RLR0, 0 * 8(%rcx);
+	movq RLR1, 1 * 8(%rcx);
+	movq RLR2, 2 * 8(%rcx);
+	movq RLR3, 3 * 8(%rcx);
+
+	popq %r14;
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+	ret;
+
+ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;)
+
+#endif /*defined(USE_CAST5)*/
+#endif /*__x86_64*/
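
For reference, the entry points in this file follow the SysV AMD64 calling convention (%rdi = ctx, %rsi = dst, %rdx = src, %rcx = iv/ctr), so the C-side declarations used by cipher/cast5.c look roughly like the sketch below. The CAST5_context and byte types are libgcrypt-internal; the typedefs here are stand-ins for illustration, not copies of the real headers.

/* Stand-in types; the real definitions live in cipher/cast5.c and
 * libgcrypt's internal headers. */
typedef unsigned char byte;
typedef struct CAST5_context CAST5_context;

/* Single-block entry points (%rdi = ctx, %rsi = dst, %rdx = src). */
extern void _gcry_cast5_amd64_encrypt_block (CAST5_context *ctx, byte *out,
                                             const byte *in);
extern void _gcry_cast5_amd64_decrypt_block (CAST5_context *ctx, byte *out,
                                             const byte *in);

/* Bulk helpers; each call processes four 8-byte blocks (%rcx = iv/ctr). */
extern void _gcry_cast5_amd64_ctr_enc (CAST5_context *ctx, byte *out,
                                       const byte *in, byte *ctr);
extern void _gcry_cast5_amd64_cbc_dec (CAST5_context *ctx, byte *out,
                                       const byte *in, byte *iv);
extern void _gcry_cast5_amd64_cfb_dec (CAST5_context *ctx, byte *out,
                                       const byte *in, byte *iv);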
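The F1/F2/F3 macros (and their 4-way F4_* counterparts) implement the three CAST-128 round-function types from RFC 2144, pulling the four 8-bit S-box indexes out of the rotated intermediate value. A plain C restatement of what one application of each round function computes is sketched below, assuming S1..S4 are the four 32-bit S-boxes that the assembly reaches through the merged _gcry_cast5_s1to4 table.

#include <stdint.h>

/* S1..S4 stand for the 32-bit CAST-128 S-boxes, assumed defined elsewhere. */
extern const uint32_t S1[256], S2[256], S3[256], S4[256];

/* Rotate-left that also works for a rotation count of 0. */
#define ROL32(x, n) (((x) << ((n) & 31)) | ((x) >> ((32 - (n)) & 31)))

static uint32_t f1(uint32_t d, uint32_t km, unsigned kr)
{
  uint32_t i = ROL32(km + d, kr);  /* type 1: add, then rotate by Kr */
  return ((S1[i >> 24] ^ S2[(i >> 16) & 0xff]) - S3[(i >> 8) & 0xff])
         + S4[i & 0xff];
}

static uint32_t f2(uint32_t d, uint32_t km, unsigned kr)
{
  uint32_t i = ROL32(km ^ d, kr);  /* type 2: xor, then rotate by Kr */
  return ((S1[i >> 24] - S2[(i >> 16) & 0xff]) + S3[(i >> 8) & 0xff])
         ^ S4[i & 0xff];
}

static uint32_t f3(uint32_t d, uint32_t km, unsigned kr)
{
  uint32_t i = ROL32(km - d, kr);  /* type 3: subtract, then rotate by Kr */
  return ((S1[i >> 24] + S2[(i >> 16) & 0xff]) ^ S3[(i >> 8) & 0xff])
         - S4[i & 0xff];
}

Each Feistel round then sets L, R = R, L ^ fN(R, Km[i], Kr[i]); the assembly keeps both 32-bit halves packed in one 64-bit register (RLR0..RLR3) and swaps them with rorq $32 instead of moving data between registers.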
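The CTR helper processes four 8-byte blocks per call: it reads the 64-bit big-endian counter, encrypts counter, counter+1, counter+2 and counter+3 as the keystream, XORs that over the input, and writes counter+4 back. A hypothetical C model is sketched below; cast5_encrypt_block() stands in for the single-block primitive and is not an actual libgcrypt function name.

#include <stdint.h>

extern void cast5_encrypt_block(void *ctx, uint8_t out[8], const uint8_t in[8]); /* assumed */

static uint64_t load_be64(const uint8_t *p)
{
  uint64_t v = 0;
  for (int i = 0; i < 8; i++)
    v = (v << 8) | p[i];
  return v;
}

static void store_be64(uint8_t *p, uint64_t v)
{
  for (int i = 7; i >= 0; i--)
    {
      p[i] = (uint8_t)v;
      v >>= 8;
    }
}

/* Hypothetical C model of _gcry_cast5_amd64_ctr_enc: four counter blocks
 * are encrypted as keystream, XORed over the input, and the big-endian
 * counter in *ctr is advanced by 4. */
static void ctr_enc_4blocks(void *ctx, uint8_t dst[32], const uint8_t src[32],
                            uint8_t ctr[8])
{
  uint64_t c = load_be64(ctr);               /* counter is kept big endian */
  for (int i = 0; i < 4; i++)
    {
      uint8_t block[8], keystream[8];
      store_be64(block, c + (uint64_t)i);
      cast5_encrypt_block(ctx, keystream, block);
      for (int j = 0; j < 8; j++)
        dst[8 * i + j] = src[8 * i + j] ^ keystream[j];
    }
  store_be64(ctr, c + 4);                    /* store new IV */
}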
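The CBC helper decrypts four ciphertext blocks, XORs each result with the preceding ciphertext block (the chaining value for the first one), and stores the last input block as the new IV; the CFB helper analogously encrypts (IV, C0, C1, C2) as the keystream, XORs it with (C0..C3), and keeps C3 as the new IV. A hypothetical C model of the CBC path is sketched below, again with cast5_decrypt_block() as an assumed single-block primitive.

#include <stdint.h>
#include <string.h>

extern void cast5_decrypt_block(void *ctx, uint8_t out[8], const uint8_t in[8]); /* assumed */

/* Hypothetical C model of _gcry_cast5_amd64_cbc_dec: P[i] = D(C[i]) ^ C[i-1]
 * with C[-1] = iv; the last ciphertext block becomes the new IV.  The new IV
 * is saved up front and the blocks are handled back to front so that an
 * in-place call (dst == src) still reads the original ciphertext. */
static void cbc_dec_4blocks(void *ctx, uint8_t dst[32], const uint8_t src[32],
                            uint8_t iv[8])
{
  uint8_t next_iv[8], plain[8];
  memcpy(next_iv, src + 3 * 8, 8);           /* store new IV (last ct block) */
  for (int i = 3; i >= 0; i--)
    {
      cast5_decrypt_block(ctx, plain, src + 8 * i);
      const uint8_t *prev = (i == 0) ? iv : src + 8 * (i - 1);
      for (int j = 0; j < 8; j++)
        dst[8 * i + j] = plain[j] ^ prev[j];
    }
  memcpy(iv, next_iv, 8);
}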