/* twofish-arm.S - ARM assembly implementation of Twofish cipher
*
* Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
*
* This file is part of Libgcrypt.
*
* Libgcrypt is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation; either version 2.1 of
* the License, or (at your option) any later version.
*
* Libgcrypt is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#if defined(__ARMEL__)
#ifdef HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS
.text
.syntax unified
.arm
/* structure of TWOFISH_context: */
#define s0 0
#define s1 ((s0) + 4 * 256)
#define s2 ((s1) + 4 * 256)
#define s3 ((s2) + 4 * 256)
#define w ((s3) + 4 * 256)
#define k ((w) + 4 * 8)
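/* The offsets above are meant to mirror TWOFISH_context in twofish.c;
 * as a C sketch (see twofish.c for the authoritative definition):
 *
 *   typedef struct {
 *     u32 s[4][256];   // four key-dependent s-box tables (s0..s3)
 *     u32 w[8];        // input/output whitening subkeys
 *     u32 k[32];       // round subkeys, two per round
 *   } TWOFISH_context;
 */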
/* register macros */
#define CTX %r0
#define CTXs0 %r0
#define CTXs1 %r1
#define CTXs3 %r7
#define RA %r3
#define RB %r4
#define RC %r5
#define RD %r6
#define RX %r2
#define RY %ip
#define RMASK %lr
#define RT0 %r8
#define RT1 %r9
#define RT2 %r10
#define RT3 %r11
/* helper macros */
#define ldr_unaligned_le(rout, rsrc, offs, rtmp) \
ldrb rout, [rsrc, #((offs) + 0)]; \
ldrb rtmp, [rsrc, #((offs) + 1)]; \
orr rout, rout, rtmp, lsl #8; \
ldrb rtmp, [rsrc, #((offs) + 2)]; \
orr rout, rout, rtmp, lsl #16; \
ldrb rtmp, [rsrc, #((offs) + 3)]; \
orr rout, rout, rtmp, lsl #24;
#define str_unaligned_le(rin, rdst, offs, rtmp0, rtmp1) \
mov rtmp0, rin, lsr #8; \
strb rin, [rdst, #((offs) + 0)]; \
mov rtmp1, rin, lsr #16; \
strb rtmp0, [rdst, #((offs) + 1)]; \
mov rtmp0, rin, lsr #24; \
strb rtmp1, [rdst, #((offs) + 2)]; \
strb rtmp0, [rdst, #((offs) + 3)];
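/* In C, the two byte-wise macros above amount to the following sketch
 * (u32 meaning a 32-bit unsigned type; the helper names are
 * illustrative, not part of this file):
 *
 *   static u32 load_le32_bytewise(const unsigned char *p)
 *   {
 *     return (u32)p[0] | ((u32)p[1] << 8)
 *            | ((u32)p[2] << 16) | ((u32)p[3] << 24);
 *   }
 *
 *   static void store_le32_bytewise(u32 v, unsigned char *p)
 *   {
 *     p[0] = v & 0xff; p[1] = (v >> 8) & 0xff;
 *     p[2] = (v >> 16) & 0xff; p[3] = v >> 24;
 *   }
 */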
#ifndef __ARMEL__
/* bswap on big-endian (never taken here: this file is only assembled
 * when __ARMEL__ is defined, see the guard at the top) */
#define host_to_le(reg) \
rev reg, reg;
#define le_to_host(reg) \
rev reg, reg;
#else
/* nop on little-endian */
#define host_to_le(reg) /*_*/
#define le_to_host(reg) /*_*/
#endif
#define ldr_input_aligned_le(rin, a, b, c, d) \
ldr a, [rin, #0]; \
ldr b, [rin, #4]; \
le_to_host(a); \
ldr c, [rin, #8]; \
le_to_host(b); \
ldr d, [rin, #12]; \
le_to_host(c); \
le_to_host(d);
#define str_output_aligned_le(rout, a, b, c, d) \
host_to_le(a); \
host_to_le(b); \
str a, [rout, #0]; \
host_to_le(c); \
str b, [rout, #4]; \
host_to_le(d); \
str c, [rout, #8]; \
str d, [rout, #12];
#ifdef __ARM_FEATURE_UNALIGNED
/* unaligned word reads/writes allowed */
#define ldr_input_le(rin, ra, rb, rc, rd, rtmp) \
ldr_input_aligned_le(rin, ra, rb, rc, rd)
#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
str_output_aligned_le(rout, ra, rb, rc, rd)
#else
/* unaligned word access not available: fall back to byte-wise access
 * when the address is not 4-byte aligned */
#define ldr_input_le(rin, ra, rb, rc, rd, rtmp0) \
tst rin, #3; \
beq 1f; \
ldr_unaligned_le(ra, rin, 0, rtmp0); \
ldr_unaligned_le(rb, rin, 4, rtmp0); \
ldr_unaligned_le(rc, rin, 8, rtmp0); \
ldr_unaligned_le(rd, rin, 12, rtmp0); \
b 2f; \
1:;\
ldr_input_aligned_le(rin, ra, rb, rc, rd); \
2:;
#define str_output_le(rout, ra, rb, rc, rd, rtmp0, rtmp1) \
tst rout, #3; \
beq 1f; \
str_unaligned_le(ra, rout, 0, rtmp0, rtmp1); \
str_unaligned_le(rb, rout, 4, rtmp0, rtmp1); \
str_unaligned_le(rc, rout, 8, rtmp0, rtmp1); \
str_unaligned_le(rd, rout, 12, rtmp0, rtmp1); \
b 2f; \
1:;\
str_output_aligned_le(rout, ra, rb, rc, rd); \
2:;
#endif
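/* Sketch of the run-time dispatch done by the fallback path above,
 * reusing the illustrative helpers from the previous comment:
 *
 *   if (((uintptr_t)in & 3) != 0) {
 *     // unaligned: assemble each word from byte loads
 *     for (i = 0; i < 4; i++)
 *       r[i] = load_le32_bytewise(in + 4 * i);
 *   } else {
 *     // aligned: plain word loads plus le_to_host byte-swapping
 *     for (i = 0; i < 4; i++)
 *       r[i] = le_to_host(((const u32 *)(void *)in)[i]);
 *   }
 */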
/**********************************************************************
1-way twofish
**********************************************************************/
#define encrypt_round(a, b, rc, rd, n, ror_a, adj_a) \
and RT0, RMASK, b, lsr#(8 - 2); \
and RY, RMASK, b, lsr#(16 - 2); \
add RT0, RT0, #(s2 - s1); \
and RT1, RMASK, b, lsr#(24 - 2); \
ldr RY, [CTXs3, RY]; \
and RT2, RMASK, b, lsl#(2); \
ldr RT0, [CTXs1, RT0]; \
and RT3, RMASK, a, lsr#(16 - 2 + (adj_a)); \
ldr RT1, [CTXs0, RT1]; \
and RX, RMASK, a, lsr#(8 - 2 + (adj_a)); \
ldr RT2, [CTXs1, RT2]; \
add RT3, RT3, #(s2 - s1); \
ldr RX, [CTXs1, RX]; \
ror_a(a); \
\
eor RY, RY, RT0; \
ldr RT3, [CTXs1, RT3]; \
and RT0, RMASK, a, lsl#(2); \
eor RY, RY, RT1; \
and RT1, RMASK, a, lsr#(24 - 2); \
eor RY, RY, RT2; \
ldr RT0, [CTXs0, RT0]; \
eor RX, RX, RT3; \
ldr RT1, [CTXs3, RT1]; \
eor RX, RX, RT0; \
\
ldr RT3, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
eor RX, RX, RT1; \
ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
\
add RT0, RX, RY, lsl #1; \
add RX, RX, RY; \
add RT0, RT0, RT3; \
add RX, RX, RT2; \
eor rd, RT0, rd, ror #31; \
eor rc, rc, RX;
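/* One encrypt_round computes, in C terms (cf. the ENCROUND macro in
 * twofish.c; byte(v,n) selects byte n of v, and ror32/rol32 are assumed
 * 32-bit rotate helpers):
 *
 *   x = s0[byte(a,0)] ^ s1[byte(a,1)] ^ s2[byte(a,2)] ^ s3[byte(a,3)];
 *   y = s1[byte(b,0)] ^ s2[byte(b,1)] ^ s3[byte(b,2)] ^ s0[byte(b,3)];
 *   c = ror32(c ^ (x +   y + k[2*n    ]), 1);  // PHT + subkey
 *   d = rol32(d, 1) ^ (x + 2*y + k[2*n + 1]);
 *
 * RMASK = 0xff << 2 extracts one byte pre-scaled by 4 for indexing the
 * word-sized s-box tables.  'eor rd, RT0, rd, ror #31' is the rol(d, 1),
 * since ror #31 == rol #1; the ror of c is deferred into the next round
 * via the ror_a/adj_a parameters (see the cycle macros below).
 */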
#define dummy(x) /*_*/
#define ror1(r) \
ror r, r, #1;
#define decrypt_round(a, b, rc, rd, n, ror_b, adj_b) \
and RT3, RMASK, b, lsl#(2 - (adj_b)); \
and RT1, RMASK, b, lsr#(8 - 2 + (adj_b)); \
ror_b(b); \
and RT2, RMASK, a, lsl#(2); \
and RT0, RMASK, a, lsr#(8 - 2); \
\
ldr RY, [CTXs1, RT3]; \
add RT1, RT1, #(s2 - s1); \
ldr RX, [CTXs0, RT2]; \
and RT3, RMASK, b, lsr#(16 - 2); \
ldr RT1, [CTXs1, RT1]; \
and RT2, RMASK, a, lsr#(16 - 2); \
ldr RT0, [CTXs1, RT0]; \
\
add RT2, RT2, #(s2 - s1); \
ldr RT3, [CTXs3, RT3]; \
eor RY, RY, RT1; \
\
and RT1, RMASK, b, lsr#(24 - 2); \
eor RX, RX, RT0; \
ldr RT2, [CTXs1, RT2]; \
and RT0, RMASK, a, lsr#(24 - 2); \
\
ldr RT1, [CTXs0, RT1]; \
\
eor RY, RY, RT3; \
ldr RT0, [CTXs3, RT0]; \
eor RX, RX, RT2; \
eor RY, RY, RT1; \
\
ldr RT1, [CTXs3, #(k - s3 + 8 * (n) + 4)]; \
eor RX, RX, RT0; \
ldr RT2, [CTXs3, #(k - s3 + 8 * (n))]; \
\
add RT0, RX, RY, lsl #1; \
add RX, RX, RY; \
add RT0, RT0, RT1; \
add RX, RX, RT2; \
eor rd, rd, RT0; \
eor rc, RX, rc, ror #31;
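/* The matching inverse, in C terms (cf. DECROUND in twofish.c; x and y
 * are derived from a and b exactly as in the encryption sketch above):
 *
 *   d = ror32(d ^ (x + 2*y + k[2*n + 1]), 1);
 *   c = rol32(c, 1) ^ (x +   y + k[2*n    ]);
 *
 * Here too one rotate is folded into the following round: 'eor rc, RX,
 * rc, ror #31' performs the rol(c, 1), while the final ror of d is
 * deferred via ror_b/adj_b.
 */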
#define first_encrypt_cycle(nc) \
encrypt_round(RA, RB, RC, RD, (nc) * 2, dummy, 0); \
encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
#define encrypt_cycle(nc) \
encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1);
#define last_encrypt_cycle(nc) \
encrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
encrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
ror1(RA);
#define first_decrypt_cycle(nc) \
decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, dummy, 0); \
decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
#define decrypt_cycle(nc) \
decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1);
#define last_decrypt_cycle(nc) \
decrypt_round(RC, RD, RA, RB, (nc) * 2 + 1, ror1, 1); \
decrypt_round(RA, RB, RC, RD, (nc) * 2, ror1, 1); \
ror1(RD);
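/* Rotation folding: rather than rotate c (resp. d) at the end of every
 * round, each cycle tells the following round to (a) fold the pending
 * 1-bit rotate into its byte-extraction shift amounts (adj = 1) and
 * (b) issue the real 'ror #1' itself once the unrotated value has been
 * consumed (ror1).  The first cycle has nothing pending (dummy, adj = 0),
 * and the last cycle applies the one rotate still outstanding. */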
.align 3
.globl _gcry_twofish_arm_encrypt_block
.type _gcry_twofish_arm_encrypt_block,%function;
_gcry_twofish_arm_encrypt_block:
/* input:
* %r0: ctx
* %r1: dst
* %r2: src
*/
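/* Matching C prototype on the caller's side (cf. the declaration in
 * twofish.c):
 *
 *   void _gcry_twofish_arm_encrypt_block(const TWOFISH_context *ctx,
 *                                        byte *dst, const byte *src);
 */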
push {%r1, %r4-%r11, %ip, %lr};
add RY, CTXs0, #w;
ldr_input_le(%r2, RA, RB, RC, RD, RT0);
/* Input whitening */
ldm RY, {RT0, RT1, RT2, RT3};
add CTXs3, CTXs0, #(s3 - s0);
add CTXs1, CTXs0, #(s1 - s0);
mov RMASK, #(0xff << 2);
eor RA, RA, RT0;
eor RB, RB, RT1;
eor RC, RC, RT2;
eor RD, RD, RT3;
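/* The ldm above fetched w[0..3], so this is (a,b,c,d) ^= (w[0..3]);
 * the output whitening after the last round uses w[4..7] instead. */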
first_encrypt_cycle(0);
encrypt_cycle(1);
encrypt_cycle(2);
encrypt_cycle(3);
encrypt_cycle(4);
encrypt_cycle(5);
encrypt_cycle(6);
last_encrypt_cycle(7);
add RY, CTXs3, #(w + 4*4 - s3);
pop {%r1}; /* dst */
/* Output whitening; storing as (c, d, a, b) undoes the swap pending
 * from the final round */
ldm RY, {RT0, RT1, RT2, RT3};
eor RC, RC, RT0;
eor RD, RD, RT1;
eor RA, RA, RT2;
eor RB, RB, RT3;
str_output_le(%r1, RC, RD, RA, RB, RT0, RT1);
pop {%r4-%r11, %ip, %pc};
.ltorg
.size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;
.align 3
.globl _gcry_twofish_arm_decrypt_block
.type _gcry_twofish_arm_decrypt_block,%function;
_gcry_twofish_arm_decrypt_block:
/* input:
* %r0: ctx
* %r1: dst
* %r2: src
*/
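/* Decryption walks the schedule backwards: rounds 15..0, with w[4..7]
 * (the encryption output whitening) XORed on input and w[0..3] on
 * output, matching the RY setup below. */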
push {%r1, %r4-%r11, %ip, %lr};
add CTXs3, CTXs0, #(s3 - s0);
ldr_input_le(%r2, RC, RD, RA, RB, RT0);
add RY, CTXs3, #(w + 4*4 - s3);
/* Input whitening */
ldm RY, {RT0, RT1, RT2, RT3};
add CTXs1, CTXs0, #(s1 - s0);
mov RMASK, #(0xff << 2);
eor RC, RC, RT0;
eor RD, RD, RT1;
eor RA, RA, RT2;
eor RB, RB, RT3;
first_decrypt_cycle(7);
decrypt_cycle(6);
decrypt_cycle(5);
decrypt_cycle(4);
decrypt_cycle(3);
decrypt_cycle(2);
decrypt_cycle(1);
last_decrypt_cycle(0);
add RY, CTXs0, #w;
pop {%r1}; /* dst */
/* Output whitening */
ldm RY, {RT0, RT1, RT2, RT3};
eor RA, RA, RT0;
eor RB, RB, RT1;
eor RC, RC, RT2;
eor RD, RD, RT3;
str_output_le(%r1, RA, RB, RC, RD, RT0, RT1);
pop {%r4-%r11, %ip, %pc};
.size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block;
#endif /*HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS*/
#endif /*__ARMEL__*/