summary refs log tree commit diff stats
path: root/libotr/libgcrypt-1.8.7/cipher/blowfish-amd64.S
diff options
context:
space:
mode:
Diffstat (limited to 'libotr/libgcrypt-1.8.7/cipher/blowfish-amd64.S')
-rw-r--r--libotr/libgcrypt-1.8.7/cipher/blowfish-amd64.S541
1 files changed, 541 insertions, 0 deletions
diff --git a/libotr/libgcrypt-1.8.7/cipher/blowfish-amd64.S b/libotr/libgcrypt-1.8.7/cipher/blowfish-amd64.S
new file mode 100644
index 0000000..21b63fc
--- /dev/null
+++ b/libotr/libgcrypt-1.8.7/cipher/blowfish-amd64.S
@@ -0,0 +1,541 @@
+/* blowfish-amd64.S  -  AMD64 assembly implementation of Blowfish cipher
+ *
+ * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64
+#include <config.h>
+#if defined(USE_BLOWFISH) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__ /* emit the wrapped .type/.size directive */
+#else
+# define ELF(...) /*_*/ /* drop the wrapped .type/.size directive */
+#endif
+
+.text
+
+/* Byte offsets into BLOWFISH_context as addressed by the code below:
+ * four lookup tables s0..s3, indexed by a byte value with 4-byte
+ * entries, followed by the round key array p. */
+#define BF_SBOX_BYTES	(256 * 4)
+
+#define s0	0
+#define s1	((s0) + BF_SBOX_BYTES)
+#define s2	((s1) + BF_SBOX_BYTES)
+#define s3	((s2) + BF_SBOX_BYTES)
+#define p	((s3) + BF_SBOX_BYTES)
+
+/* register macros */
+
+/* pointer registers */
+#define CTX %rdi
+#define RIO %rsi
+
+/* block registers: 64-bit, 32-bit, bits 7..0 and bits 15..8 views.
+ * The bl/bh names must keep this exact spelling: F4() builds them by
+ * pasting "bl"/"bh" onto the name of the block register. */
+#define RX0   %rax
+#define RX0d  %eax
+#define RX0bl %al
+#define RX0bh %ah
+
+#define RX1   %rbx
+#define RX1d  %ebx
+#define RX1bl %bl
+#define RX1bh %bh
+
+#define RX2   %rcx
+#define RX2d  %ecx
+#define RX2bl %cl
+#define RX2bh %ch
+
+#define RX3   %rdx
+#define RX3d  %edx
+#define RX3bl %dl
+#define RX3bh %dh
+
+/* scratch registers: 64-bit and 32-bit views.
+ * RT1 is the same register as RIO. */
+#define RT0  %rbp
+#define RT0d %ebp
+
+#define RT1  %rsi
+#define RT1d %esi
+
+#define RT2  %r8
+#define RT2d %r8d
+
+#define RT3  %r9
+#define RT3d %r9d
+
+/* preloaded 64-bit round key of the 4-way code */
+#define RKEY %r10
+
+/***********************************************************************
+ * 1-way blowfish
+ *
+ * F() works on the 64-bit block held in RX0.  The four bytes of the low
+ * 32 bits of RX0 are used, most significant byte first, as indexes a, b,
+ * c and d into the tables s0, s1, s2 and s3 of the context, and the four
+ * 32-bit entries are combined as ((s0[a] + s1[b]) ^ s2[c]) + s3[d].
+ * The two rotates by 16 turn RX0 by 32 bits in total, so the two halves
+ * of RX0 have changed places by the time the result is XORed in; the
+ * result therefore lands in the half that did not supply the indexes.
+ * RT0d is written with 32-bit operations, which leaves the upper 32 bits
+ * of RT0 zero, so the final 64-bit XOR changes the low half only.
+ *
+ * Clobbers RT0, RT1, RT2 and RT3.  RT1 is the same register as RIO, so
+ * RIO does not survive F().
+ ***********************************************************************/
+#define F() \
+	movzbl RX0bh,		RT1d; \
+	movzbl RX0bl,		RT3d; \
+	rorq $16,		RX0; \
+	movzbl RX0bh,		RT0d; \
+	movzbl RX0bl,		RT2d; \
+	rorq $16,		RX0; \
+	movl s0(CTX,RT0,4),	RT0d; \
+	addl s1(CTX,RT2,4),	RT0d; \
+	xorl s2(CTX,RT1,4),	RT0d; \
+	addl s3(CTX,RT3,4),	RT0d; \
+	xorq RT0,		RX0;
+
+#define load_roundkey_enc(n) /* RX3 = the 8 bytes at p + 4*n */ \
+	movq p+4*(n)(CTX), 	RX3;
+
+#define add_roundkey_enc() /* RX0 ^= RX3, all 64 bits at once */ \
+	xorq RX3, 		RX0;
+
+#define round_enc(n) /* apply the loaded key, load the one at p + 4*n, F() twice */ \
+	add_roundkey_enc(); \
+	load_roundkey_enc(n); \
+	\
+	F(); \
+	F();
+
+/* RX3 = the 8 bytes at p + 4*(n-1), with the two 32-bit halves swapped
+ * by the rotate.  The argument is parenthesized, as it already is in
+ * load_roundkey_enc() and preload_roundkey_dec(), so that an expression
+ * passed as n expands as intended. */
+#define load_roundkey_dec(n) \
+	movq p+4*((n)-1)(CTX),	RX3; \
+	rorq $32,		RX3;
+
+/* RX0 ^= RX3, all 64 bits at once. */
+#define add_roundkey_dec() \
+	xorq RX3, 		RX0;
+
+/* Apply the key loaded so far, load the key selected by n for the next
+ * step, then run F() twice. */
+#define round_dec(n) \
+	add_roundkey_dec(); \
+	load_roundkey_dec(n); \
+	\
+	F(); \
+	F();
+
+#define read_block() /* RX0: low 32 bits = bytes 0..3, high 32 bits = bytes 4..7 of (RIO), each read big endian */ \
+	movq (RIO), 		RX0; \
+	rorq $32, 		RX0; \
+	bswapq 			RX0;
+
+#define write_block() /* store RX0 to (RIO) as 8 big-endian bytes: high 32 bits first, then low 32 bits */ \
+	bswapq 			RX0; \
+	movq RX0, 		(RIO);
+
+.align 8
+ELF(.type   __blowfish_enc_blk1,@function;)
+
+__blowfish_enc_blk1:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RX0: input plaintext block
+	 * output:
+	 *	RX0: output ciphertext block
+	 *
+	 * Internal helper of the 1-way encryption entry points.  Starts
+	 * with the 8 key bytes at p+0, ends with the 8 key bytes at
+	 * p+4*16 and runs F() sixteen times in between.
+	 *
+	 * clobbers: RX3 (%rdx), RT1 (%rsi), RT2 (%r8), RT3 (%r9), %r11.
+	 * RT0 (%rbp) is overwritten by F() as well; it is parked in %r11
+	 * on entry and restored before returning.  %rcx and %r10 are not
+	 * touched, which the callers in this file rely on.
+	 */
+	movq %rbp, %r11; /* save %rbp, F() uses it as RT0 */
+
+	load_roundkey_enc(0);
+	round_enc(2);
+	round_enc(4);
+	round_enc(6);
+	round_enc(8);
+	round_enc(10);
+	round_enc(12);
+	round_enc(14);
+	round_enc(16);
+	add_roundkey_enc(); /* the key loaded by round_enc(16) */
+
+	movq %r11, %rbp; /* restore %rbp */
+
+	ret;
+ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;)
+
+.align 8
+.globl  _gcry_blowfish_amd64_do_encrypt
+ELF(.type   _gcry_blowfish_amd64_do_encrypt,@function;)
+
+_gcry_blowfish_amd64_do_encrypt:
+	/* Encrypt one block that is passed as two 32-bit words, in place.
+	 *
+	 * input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: u32 *ret_xl
+	 *	%rdx: u32 *ret_xr
+	 * output:
+	 *	*ret_xl, *ret_xr: overwritten with the encrypted words
+	 */
+
+	/* __blowfish_enc_blk1 overwrites %rsi and %rdx, so both pointers
+	 * are kept in registers that it leaves alone. */
+	movq %rsi, RX2;
+	movq %rdx, %r10;
+
+	/* RX0 = (*ret_xr << 32) | *ret_xl */
+	movl (%rsi), RT3d;
+	movl (%rdx), RX0d;
+	shlq $32, RX0;
+	orq RT3, RX0;
+
+	call __blowfish_enc_blk1;
+
+	/* low half goes to *ret_xr, high half to *ret_xl */
+	movl RX0d, (%r10);
+	shrq $32, RX0;
+	movl RX0d, (RX2);
+
+	ret;
+ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;)
+
+.align 8
+.globl  _gcry_blowfish_amd64_encrypt_block
+ELF(.type   _gcry_blowfish_amd64_encrypt_block,@function;)
+
+_gcry_blowfish_amd64_encrypt_block:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *
+	 * Encrypts the 8-byte block at src and stores the result at dst.
+	 * The whole block is read from src before anything is written to
+	 * dst.
+	 */
+
+	movq %rsi, %r10; /* keep dst: RIO is %rsi and is reused for src */
+
+	movq %rdx, RIO;
+	read_block();
+
+	call __blowfish_enc_blk1; /* leaves %r10 untouched, clobbers RIO */
+
+	movq %r10, RIO;
+	write_block();
+
+	ret;
+ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;)
+
+.align 8
+.globl  _gcry_blowfish_amd64_decrypt_block
+ELF(.type   _gcry_blowfish_amd64_decrypt_block,@function;)
+
+_gcry_blowfish_amd64_decrypt_block:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst
+	 *	%rdx: src
+	 *
+	 * Decrypts the 8-byte block at src and stores the result at dst.
+	 * The round keys are walked in the opposite direction to the
+	 * encryption path: from the 8 bytes at p+4*16 down to the 8 bytes
+	 * at p+0, each with its 32-bit halves swapped by
+	 * load_roundkey_dec().
+	 *
+	 * clobbers: RX0 (%rax), RX3 (%rdx), RT1 (%rsi), RT2 (%r8),
+	 * RT3 (%r9), %r10, %r11.
+	 */
+	movq %rbp, %r11; /* save %rbp, F() uses it as RT0 */
+
+	movq %rsi, %r10; /* keep dst: RIO is %rsi and is reused for src */
+	movq %rdx, RIO;
+
+	read_block();
+
+	load_roundkey_dec(17);
+	round_dec(15);
+	round_dec(13);
+	round_dec(11);
+	round_dec(9);
+	round_dec(7);
+	round_dec(5);
+	round_dec(3);
+	round_dec(1);
+	add_roundkey_dec(); /* the key loaded by round_dec(1) */
+
+	movq %r10, RIO;
+	write_block();
+
+	movq %r11, %rbp; /* restore %rbp */
+
+	ret;
+ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;)
+
+/**********************************************************************
+  4-way blowfish, four blocks parallel
+
+  F4(x) is the same instruction sequence as F(), with the block register
+  as a parameter.  x has to be one of the names RX0, RX1, RX2 or RX3,
+  because the byte registers are formed by pasting bh/bl onto that name.
+  Clobbers RT0, RT1, RT2 and RT3.
+ **********************************************************************/
+#define F4(x) \
+	movzbl x ## bh,		RT1d; \
+	movzbl x ## bl,		RT3d; \
+	rorq $16,		x; \
+	movzbl x ## bh,		RT0d; \
+	movzbl x ## bl,		RT2d; \
+	rorq $16,		x; \
+	movl s0(CTX,RT0,4),	RT0d; \
+	addl s1(CTX,RT2,4),	RT0d; \
+	xorl s2(CTX,RT1,4),	RT0d; \
+	addl s3(CTX,RT3,4),	RT0d; \
+	xorq RT0,		x;
+
+#define add_preloaded_roundkey4() /* XOR the 64-bit key in RKEY into all four blocks */ \
+	xorq RKEY,		RX0; \
+	xorq RKEY,		RX1; \
+	xorq RKEY,		RX2; \
+	xorq RKEY,		RX3;
+
+#define preload_roundkey_enc(n) /* RKEY = the 8 bytes at p + 4*n */ \
+	movq p+4*(n)(CTX),	RKEY;
+
+#define add_roundkey_enc4(n) /* apply RKEY, then fetch the key at p + 4*(n+2) */ \
+	add_preloaded_roundkey4(); \
+	preload_roundkey_enc(n + 2);
+
+#define round_enc4(n) /* key step, then F4 twice on every block, blocks interleaved */ \
+	add_roundkey_enc4(n); \
+	\
+	F4(RX0); \
+	F4(RX1); \
+	F4(RX2); \
+	F4(RX3); \
+	\
+	F4(RX0); \
+	F4(RX1); \
+	F4(RX2); \
+	F4(RX3);
+
+#define preload_roundkey_dec(n) /* RKEY = the 8 bytes at p + 4*(n-1), 32-bit halves swapped */ \
+	movq p+4*((n)-1)(CTX),	RKEY; \
+	rorq $32,		RKEY;
+
+#define add_roundkey_dec4(n) /* apply RKEY, then fetch the key selected by n-2 */ \
+	add_preloaded_roundkey4(); \
+	preload_roundkey_dec(n - 2);
+
+#define round_dec4(n) /* key step, then F4 twice on every block, blocks interleaved */ \
+	add_roundkey_dec4(n); \
+	\
+	F4(RX0); \
+	F4(RX1); \
+	F4(RX2); \
+	F4(RX3); \
+	\
+	F4(RX0); \
+	F4(RX1); \
+	F4(RX2); \
+	F4(RX3);
+
+/* Prepare four blocks that were loaded straight from memory: swap the
+ * 32-bit halves of each register, then reverse its bytes. */
+#define inbswap_block4() \
+	rorq $32,		RX0; \
+	rorq $32,		RX1; \
+	rorq $32,		RX2; \
+	rorq $32,		RX3; \
+	bswapq 			RX0; \
+	bswapq 			RX1; \
+	bswapq 			RX2; \
+	bswapq 			RX3;
+
+/* Prepare four counter values that are already byte swapped (see
+ * _gcry_blowfish_amd64_ctr_enc): only the 32-bit halves change places. */
+#define inctrswap_block4() \
+	rorq $32,		RX0; \
+	rorq $32,		RX1; \
+	rorq $32,		RX2; \
+	rorq $32,		RX3;
+
+/* Reverse the bytes of the four result registers, so that they can be
+ * stored or XORed with data through plain 64-bit memory operands. */
+#define outbswap_block4() \
+	bswapq 			RX0; \
+	bswapq 			RX1; \
+	bswapq 			RX2; \
+	bswapq 			RX3;
+
+.align 8
+ELF(.type   __blowfish_enc_blk4,@function;)
+
+__blowfish_enc_blk4:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RX0,RX1,RX2,RX3: four input inbswapped plaintext blocks
+	 * output:
+	 *	RX0,RX1,RX2,RX3: four output ciphertext blocks
+	 *
+	 * The callers prepare the input with inbswap_block4() or
+	 * inctrswap_block4(); outbswap_block4() is applied to the output
+	 * in here.
+	 *
+	 * clobbers: RKEY (%r10), RT0 (%rbp), RT1 (%rsi), RT2 (%r8),
+	 * RT3 (%r9).  %rbp is not saved in this routine; the callers in
+	 * this file push it before calling.
+	 */
+	preload_roundkey_enc(0);
+
+	round_enc4(0);
+	round_enc4(2);
+	round_enc4(4);
+	round_enc4(6);
+	round_enc4(8);
+	round_enc4(10);
+	round_enc4(12);
+	round_enc4(14);
+	add_preloaded_roundkey4(); /* the key preloaded by round_enc4(14) */
+
+	outbswap_block4();
+
+	ret;
+ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;)
+
+.align 8
+ELF(.type   __blowfish_dec_blk4,@function;)
+
+__blowfish_dec_blk4:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	RX0,RX1,RX2,RX3: four input ciphertext blocks
+	 * output:
+	 *	RX0,RX1,RX2,RX3: four output plaintext blocks
+	 *
+	 * Unlike __blowfish_enc_blk4, the input is taken as loaded from
+	 * memory: inbswap_block4() is applied in here, and so is
+	 * outbswap_block4() on the way out.
+	 *
+	 * clobbers: RKEY (%r10), RT0 (%rbp), RT1 (%rsi), RT2 (%r8),
+	 * RT3 (%r9).  %rbp is not saved in this routine; the caller in
+	 * this file pushes it before calling.
+	 */
+	preload_roundkey_dec(17);
+
+	inbswap_block4();
+
+	round_dec4(17);
+	round_dec4(15);
+	round_dec4(13);
+	round_dec4(11);
+	round_dec4(9);
+	round_dec4(7);
+	round_dec4(5);
+	round_dec4(3);
+	add_preloaded_roundkey4(); /* the key preloaded by round_dec4(3) */
+
+	outbswap_block4();
+
+	ret;
+ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;)
+
+.align 8
+.globl  _gcry_blowfish_amd64_ctr_enc
+ELF(.type   _gcry_blowfish_amd64_ctr_enc,@function;)
+_gcry_blowfish_amd64_ctr_enc:
+	/* CTR mode, four blocks per call.
+	 *
+	 * input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (4 blocks)
+	 *	%rdx: src (4 blocks)
+	 *	%rcx: iv (big endian, 64bit)
+	 * output:
+	 *	dst[i] = src[i] ^ E(iv + i) for i = 0..3
+	 *	iv is advanced by 4 and written back big endian
+	 */
+	pushq %rbp;
+	pushq %rbx;
+	pushq %r12;
+	pushq %r13;
+
+	/* The argument registers double as RT1, RX3 and RX2, so the
+	 * pointers move to %r11-%r13, which __blowfish_enc_blk4 does
+	 * not use. */
+	movq %rsi, %r11; /*dst*/
+	movq %rdx, %r12; /*src*/
+	movq %rcx, %r13; /*iv*/
+
+	/* RT0 = counter as a native 64-bit integer */
+	movq (%r13), RT0;
+	bswapq RT0;
+
+	/* counters for the four blocks */
+	movq RT0, RX0;
+	leaq 1(RT0), RX1;
+	leaq 2(RT0), RX2;
+	leaq 3(RT0), RX3;
+
+	/* counter for the next call goes back to *iv, big endian again */
+	leaq 4(RT0), RT0;
+	bswapq RT0;
+	movq RT0, (%r13);
+
+	inctrswap_block4();
+
+	call __blowfish_enc_blk4;
+
+	/* XOR the key stream with the input; every source block is read
+	 * before the first destination block is written */
+	xorq 0 * 8(%r12), RX0;
+	xorq 1 * 8(%r12), RX1;
+	xorq 2 * 8(%r12), RX2;
+	xorq 3 * 8(%r12), RX3;
+	movq RX0, 0 * 8(%r11);
+	movq RX1, 1 * 8(%r11);
+	movq RX2, 2 * 8(%r11);
+	movq RX3, 3 * 8(%r11);
+
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+
+	ret;
+ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;)
+
+.align 8
+.globl  _gcry_blowfish_amd64_cbc_dec
+ELF(.type   _gcry_blowfish_amd64_cbc_dec,@function;)
+_gcry_blowfish_amd64_cbc_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (4 blocks)
+	 *	%rdx: src (4 blocks)
+	 *	%rcx: iv (64bit)
+	 *
+	 * CBC decryption of four blocks:
+	 *	dst[0] = D(src[0]) ^ iv
+	 *	dst[i] = D(src[i]) ^ src[i-1]	for i = 1..3
+	 *	iv     = src[3]
+	 * Every read of src happens before the first write to dst.
+	 */
+	pushq %rbp; /* used as RT0 */
+	pushq %rbx; /* used as RX1 */
+	pushq %r12;
+	pushq %r13;
+
+	/* %r11-%r13 are not used by __blowfish_dec_blk4 */
+	movq %rsi, %r11; /*dst*/
+	movq %rdx, %r12; /*src*/
+	movq %rcx, %r13; /*iv*/
+
+	/* load input */
+	movq 0 * 8(%r12), RX0;
+	movq 1 * 8(%r12), RX1;
+	movq 2 * 8(%r12), RX2;
+	movq 3 * 8(%r12), RX3;
+
+	call __blowfish_dec_blk4;
+
+	movq 3 * 8(%r12), RT0; /* last ciphertext block, becomes the new IV */
+	xorq      (%r13), RX0;
+	xorq 0 * 8(%r12), RX1;
+	xorq 1 * 8(%r12), RX2;
+	xorq 2 * 8(%r12), RX3;
+	movq RT0, (%r13); /* store new IV */
+
+	movq RX0, 0 * 8(%r11);
+	movq RX1, 1 * 8(%r11);
+	movq RX2, 2 * 8(%r11);
+	movq RX3, 3 * 8(%r11);
+
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+
+	ret;
+ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;)
+
+.align 8
+.globl  _gcry_blowfish_amd64_cfb_dec
+ELF(.type   _gcry_blowfish_amd64_cfb_dec,@function;)
+_gcry_blowfish_amd64_cfb_dec:
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (4 blocks)
+	 *	%rdx: src (4 blocks)
+	 *	%rcx: iv (64bit)
+	 *
+	 * CFB decryption of four blocks:
+	 *	dst[0] = E(iv) ^ src[0]
+	 *	dst[i] = E(src[i-1]) ^ src[i]	for i = 1..3
+	 *	iv     = src[3]
+	 * Every read of src happens before the first write to dst.
+	 */
+	pushq %rbp; /* used as RT0 */
+	pushq %rbx; /* used as RX1 */
+	pushq %r12;
+	pushq %r13;
+
+	/* %r11-%r13 are not used by __blowfish_enc_blk4 */
+	movq %rcx, %r13; /*iv*/
+	movq %rdx, %r12; /*src*/
+	movq %rsi, %r11; /*dst*/
+
+	/* Load input */
+	movq (%r13), RX0;
+	movq 0 * 8(%r12), RX1;
+	movq 1 * 8(%r12), RX2;
+	movq 2 * 8(%r12), RX3;
+
+	inbswap_block4();
+
+	/* Update IV */
+	movq 3 * 8(%r12), RT0;
+	movq RT0, (%r13);
+
+	call __blowfish_enc_blk4;
+
+	xorq 0 * 8(%r12), RX0;
+	xorq 1 * 8(%r12), RX1;
+	xorq 2 * 8(%r12), RX2;
+	xorq 3 * 8(%r12), RX3;
+	movq RX0, 0 * 8(%r11);
+	movq RX1, 1 * 8(%r11);
+	movq RX2, 2 * 8(%r11);
+	movq RX3, 3 * 8(%r11);
+
+	popq %r13;
+	popq %r12;
+	popq %rbx;
+	popq %rbp;
+	ret;
+ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;)
+
+#endif /*defined(USE_BLOWFISH)*/
+#endif /*__x86_64*/