diff options
Diffstat (limited to 'src/fe-gtk/mmx_cmod.S')
-rw-r--r-- | src/fe-gtk/mmx_cmod.S | 530 |
1 files changed, 530 insertions, 0 deletions
diff --git a/src/fe-gtk/mmx_cmod.S b/src/fe-gtk/mmx_cmod.S new file mode 100644 index 00000000..12e866de --- /dev/null +++ b/src/fe-gtk/mmx_cmod.S @@ -0,0 +1,530 @@ +/* + * Copyright (C) 1997-2001, Michael Jennings + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies of the Software, its documentation and marketing & publicity + * materials, and acknowledgment shall be given in the documentation, materials + * and software packages that this Software was used. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +/* MMX routines for tinting XImages written by Willem Monsuwe <willem@stack.nl> */ + +/* Function calling conventions: + * shade_ximage_xx(void *data, int bpl, int w, int h, int rm, int gm, int bm); + */ + +#define data 8(%ebp) +#define bpl 12(%ebp) +#define w 16(%ebp) +#define h 20(%ebp) +#define rm 24(%ebp) +#define gm 28(%ebp) +#define bm 32(%ebp) + +#ifdef UNDERSCORE_SYMBOLS /* need this to link with msvc */ +#define SHADE_XIMAGE_15 _shade_ximage_15_mmx +#define SHADE_XIMAGE_16 _shade_ximage_16_mmx +#define SHADE_XIMAGE_32 _shade_ximage_32_mmx +#define HAVE_MMX _have_mmx +#else +#define SHADE_XIMAGE_15 shade_ximage_15_mmx +#define SHADE_XIMAGE_16 shade_ximage_16_mmx +#define SHADE_XIMAGE_32 shade_ximage_32_mmx +#define HAVE_MMX have_mmx +#endif + +.globl SHADE_XIMAGE_15 +.globl SHADE_XIMAGE_16 +.globl SHADE_XIMAGE_32 +.globl HAVE_MMX + +.bss +.text +.align 8 + +#define ENTER \ + pushl %ebp ;\ + movl %esp, %ebp ;\ + pushl %ebx ;\ + pushl %ecx ;\ + pushl %edx ;\ + pushl %edi ;\ + pushl %esi ;\ + movl data, %esi ;\ + movl w, %ebx ;\ + movl h, %edx + +#define LEAVE \ +4: ;\ + emms ;\ + popl %esi ;\ + popl %edi ;\ + popl %edx ;\ + popl %ecx ;\ + popl %ebx ;\ + movl %ebp, %esp ;\ + popl %ebp ;\ + ret + + +SHADE_XIMAGE_15: + ENTER + + leal -6(%esi, %ebx, 2), %esi + negl %ebx + jz 5f + + /* Setup multipliers */ + movd rm, %mm5 + movd gm, %mm6 + movd bm, %mm7 + punpcklwd %mm5, %mm5 /* 00 00 00 00 rm rm rm rm */ + punpcklwd %mm6, %mm6 /* 00 00 00 00 gm gm gm gm */ + punpcklwd %mm7, %mm7 /* 00 00 00 00 bm bm bm bm */ + punpckldq %mm5, %mm5 /* rm rm rm rm rm rm rm rm */ + punpckldq %mm6, %mm6 /* gm gm gm gm gm gm gm gm */ + punpckldq %mm7, %mm7 /* bm bm bm bm bm bm bm bm */ + + cmpl $256, rm + jg shade_ximage_15_mmx_saturate + cmpl $256, gm + jg shade_ximage_15_mmx_saturate + cmpl $256, bm + jg shade_ximage_15_mmx_saturate + +1: movl %ebx, %ecx + addl $3, %ecx + jns 3f +2: + movq (%esi, %ecx, 2), %mm0 + + movq %mm0, %mm1 /* rg gb */ + movq %mm0, %mm2 /* rg gb */ + psrlw $5, %mm1 /* 0r rg */ + psrlw $10, %mm0 /* 00 0r */ + psllw $11, %mm2 /* b0 00 */ + psllw $11, %mm1 /* g0 00 */ + psllw $8, %mm0 /* 0r 00 */ + psrlw $3, %mm1 /* 0g 00 */ + psrlw $3, %mm2 /* 0b 00 */ + + pmulhw %mm5, %mm0 /* 00 0r */ + pmulhw %mm6, %mm1 /* 00 0g */ + pmulhw %mm7, %mm2 /* 00 0b */ + + psllw $10, %mm0 /* r0 00 */ + psllw $5, %mm1 /* 0g g0 */ + por %mm2, %mm0 /* r0 0b */ + por %mm1, %mm0 /* rg gb */ + + movq %mm0, (%esi, %ecx, 2) + + addl $4, %ecx + js 2b + jmp 4f +3: + movw (%esi, %ecx, 2), %ax + movd %eax, %mm0 + + movq %mm0, %mm1 /* rg gb */ + movq %mm0, %mm2 /* rg gb */ + psrlw $5, %mm1 /* 0r rg */ + psrlw $10, %mm0 /* 00 0r */ + psllw $11, %mm2 /* b0 00 */ + psllw $11, %mm1 /* g0 00 */ + psllw $8, %mm0 /* 0r 00 */ + psrlw $3, %mm1 /* 0g 00 */ + psrlw $3, %mm2 /* 0b 00 */ + + pmulhw %mm5, %mm0 /* 00 0r */ + pmulhw %mm6, %mm1 /* 00 0g */ + pmulhw %mm7, %mm2 /* 00 0b */ + + psllw $10, %mm0 /* r0 00 */ + psllw $5, %mm1 /* 0g g0 */ + por %mm2, %mm0 /* r0 0b */ + por %mm1, %mm0 /* rg gb */ + + movd %mm0, %eax + movw %ax, (%esi, %ecx, 2) + + incl %ecx +4: + cmpl $2, %ecx + jng 3b + + addl bpl, %esi + decl %edx + jnz 1b +5: + LEAVE + + +shade_ximage_15_mmx_saturate: + + pcmpeqw %mm3, %mm3 + psllw $5, %mm3 /* ff e0 ff e0 ff e0 ff e0 */ + +1: movl %ebx, %ecx + addl $3, %ecx + jns 3f +2: + movq (%esi, %ecx, 2), %mm0 + + movq %mm0, %mm1 /* rg gb */ + movq %mm0, %mm2 /* rg gb */ + psrlw $5, %mm1 /* 0r rg */ + psrlw $10, %mm0 /* 00 0r */ + psllw $11, %mm2 /* b0 00 */ + psllw $11, %mm1 /* g0 00 */ + psllw $8, %mm0 /* 0r 00 */ + psrlw $3, %mm1 /* 0g 00 */ + psrlw $3, %mm2 /* 0b 00 */ + + pmulhw %mm5, %mm0 /* xx xr */ + pmulhw %mm6, %mm1 /* xx xg */ + pmulhw %mm7, %mm2 /* xx xb */ + + /* Saturate upper */ + paddusw %mm3, %mm0 /* ff er */ + paddusw %mm3, %mm1 /* ff eg */ + paddusw %mm3, %mm2 /* ff eb */ + + psubw %mm3, %mm0 /* 00 0r */ + psubw %mm3, %mm1 /* 00 0g */ + psubw %mm3, %mm2 /* 00 0b */ + + psllw $10, %mm0 /* r0 00 */ + psllw $5, %mm1 /* 0g g0 */ + por %mm2, %mm0 /* r0 0b */ + por %mm1, %mm0 /* rg gb */ + + movq %mm0, (%esi, %ecx, 2) + + addl $4, %ecx + js 2b + jmp 4f +3: + movw (%esi, %ecx, 2), %ax + movd %eax, %mm0 + + movq %mm0, %mm1 /* rg gb */ + movq %mm0, %mm2 /* rg gb */ + psrlw $5, %mm1 /* 0r rg */ + psrlw $10, %mm0 /* 00 0r */ + psllw $11, %mm2 /* b0 00 */ + psllw $11, %mm1 /* g0 00 */ + psllw $8, %mm0 /* 0r 00 */ + psrlw $3, %mm1 /* 0g 00 */ + psrlw $3, %mm2 /* 0b 00 */ + + pmulhw %mm5, %mm0 /* xx xr */ + pmulhw %mm6, %mm1 /* xx xg */ + pmulhw %mm7, %mm2 /* xx xb */ + + /* Saturate upper */ + paddusw %mm3, %mm0 /* ff er */ + paddusw %mm3, %mm1 /* ff eg */ + paddusw %mm3, %mm2 /* ff eb */ + + psubw %mm3, %mm0 /* 00 0r */ + psubw %mm3, %mm1 /* 00 0g */ + psubw %mm3, %mm2 /* 00 0b */ + + psllw $10, %mm0 /* r0 00 */ + psllw $5, %mm1 /* 0g g0 */ + por %mm2, %mm0 /* r0 0b */ + por %mm1, %mm0 /* rg gb */ + + movd %mm0, %eax + movw %ax, (%esi, %ecx, 2) + + incl %ecx +4: + cmpl $2, %ecx + jng 3b + + addl bpl, %esi + decl %edx + jnz 1b +5: + LEAVE + + +SHADE_XIMAGE_16: + ENTER + + leal -6(%esi, %ebx, 2), %esi + negl %ebx + jz 5f + + /* Setup multipliers */ + movd rm, %mm5 + movd gm, %mm6 + movd bm, %mm7 + punpcklwd %mm5, %mm5 /* 00 00 00 00 rm rm rm rm */ + punpcklwd %mm6, %mm6 /* 00 00 00 00 gm gm gm gm */ + punpcklwd %mm7, %mm7 /* 00 00 00 00 bm bm bm bm */ + punpckldq %mm5, %mm5 /* rm rm rm rm rm rm rm rm */ + punpckldq %mm6, %mm6 /* gm gm gm gm gm gm gm gm */ + punpckldq %mm7, %mm7 /* bm bm bm bm bm bm bm bm */ + + cmpl $256, rm + jg shade_ximage_16_mmx_saturate + cmpl $256, gm + jg shade_ximage_16_mmx_saturate + cmpl $256, bm + jg shade_ximage_16_mmx_saturate + +1: movl %ebx, %ecx + addl $3, %ecx + jns 3f +2: + movq (%esi, %ecx, 2), %mm0 + + movq %mm0, %mm1 /* rg gb */ + movq %mm0, %mm2 /* rg gb */ + psrlw $5, %mm1 /* 0r rg */ + psrlw $11, %mm0 /* 00 0r */ + psllw $11, %mm2 /* b0 00 */ + psllw $10, %mm1 /* g0 00 */ + psllw $8, %mm0 /* 0r 00 */ + psrlw $2, %mm1 /* 0g 00 */ + psrlw $3, %mm2 /* 0b 00 */ + + pmulhw %mm5, %mm0 /* 00 0r */ + pmulhw %mm6, %mm1 /* 00 0g */ + pmulhw %mm7, %mm2 /* 00 0b */ + + psllw $11, %mm0 /* r0 00 */ + psllw $5, %mm1 /* 0g g0 */ + por %mm2, %mm0 /* r0 0b */ + por %mm1, %mm0 /* rg gb */ + + movq %mm0, (%esi, %ecx, 2) + + addl $4, %ecx + js 2b + jmp 4f +3: + movw (%esi, %ecx, 2), %ax + movd %eax, %mm0 + + movq %mm0, %mm1 /* rg gb */ + movq %mm0, %mm2 /* rg gb */ + psrlw $5, %mm1 /* 0r rg */ + psrlw $11, %mm0 /* 00 0r */ + psllw $11, %mm2 /* b0 00 */ + psllw $10, %mm1 /* g0 00 */ + psllw $8, %mm0 /* 0r 00 */ + psrlw $2, %mm1 /* 0g 00 */ + psrlw $3, %mm2 /* 0b 00 */ + + pmulhw %mm5, %mm0 /* 00 0r */ + pmulhw %mm6, %mm1 /* 00 0g */ + pmulhw %mm7, %mm2 /* 00 0b */ + + psllw $11, %mm0 /* r0 00 */ + psllw $5, %mm1 /* 0g g0 */ + por %mm2, %mm0 /* r0 0b */ + por %mm1, %mm0 /* rg gb */ + + movd %mm0, %eax + movw %ax, (%esi, %ecx, 2) + + incl %ecx +4: + cmpl $2, %ecx + jng 3b + + addl bpl, %esi + decl %edx + jnz 1b +5: + LEAVE + + +shade_ximage_16_mmx_saturate: + + pcmpeqw %mm3, %mm3 + movq %mm3, %mm4 + psllw $5, %mm3 /* ff e0 ff e0 ff e0 ff e0 */ + psllw $6, %mm4 /* ff c0 ff c0 ff c0 ff c0 */ + +1: movl %ebx, %ecx + addl $3, %ecx + jns 3f +2: + movq (%esi, %ecx, 2), %mm0 + + movq %mm0, %mm1 /* rg gb */ + movq %mm0, %mm2 /* rg gb */ + psrlw $5, %mm1 /* 0r rg */ + psrlw $11, %mm0 /* 00 0r */ + psllw $11, %mm2 /* b0 00 */ + psllw $10, %mm1 /* g0 00 */ + psllw $8, %mm0 /* 0r 00 */ + psrlw $2, %mm1 /* 0g 00 */ + psrlw $3, %mm2 /* 0b 00 */ + + pmulhw %mm5, %mm0 /* xx xr */ + pmulhw %mm6, %mm1 /* xx xg */ + pmulhw %mm7, %mm2 /* xx xb */ + + /* Saturate upper */ + paddusw %mm3, %mm0 /* ff er */ + paddusw %mm4, %mm1 /* ff cg */ + paddusw %mm3, %mm2 /* ff eb */ + + psubw %mm4, %mm1 /* 00 0g */ + psubw %mm3, %mm2 /* 00 0b */ + + psllw $11, %mm0 /* r0 00 */ + psllw $5, %mm1 /* 0g g0 */ + por %mm2, %mm0 /* r0 0b */ + por %mm1, %mm0 /* rg gb */ + + movq %mm0, (%esi, %ecx, 2) + + addl $4, %ecx + js 2b + jmp 4f +3: + movw (%esi, %ecx, 2), %ax + movd %eax, %mm0 + + movq %mm0, %mm1 /* rg gb */ + movq %mm0, %mm2 /* rg gb */ + psrlw $5, %mm1 /* 0r rg */ + psrlw $11, %mm0 /* 00 0r */ + psllw $11, %mm2 /* b0 00 */ + psllw $10, %mm1 /* g0 00 */ + psllw $8, %mm0 /* 0r 00 */ + psrlw $2, %mm1 /* 0g 00 */ + psrlw $3, %mm2 /* 0b 00 */ + + pmulhw %mm5, %mm0 /* xx xr */ + pmulhw %mm6, %mm1 /* xx xg */ + pmulhw %mm7, %mm2 /* xx xb */ + + /* Saturate upper */ + paddusw %mm3, %mm0 /* ff er */ + paddusw %mm4, %mm1 /* ff cg */ + paddusw %mm3, %mm2 /* ff eb */ + + psubw %mm4, %mm1 /* 00 0g */ + psubw %mm3, %mm2 /* 00 0b */ + + psllw $11, %mm0 /* r0 00 */ + psllw $5, %mm1 /* 0g g0 */ + por %mm2, %mm0 /* r0 0b */ + por %mm1, %mm0 /* rg gb */ + + movd %mm0, %eax + movw %ax, (%esi, %ecx, 2) + + incl %ecx +4: + cmpl $2, %ecx + jng 3b + + addl bpl, %esi + decl %edx + jnz 1b +5: + LEAVE + + +SHADE_XIMAGE_32: + ENTER + + leal (%esi, %ebx, 4), %esi + negl %ebx + jz 3f + + movd rm, %mm4 + movd gm, %mm5 + movd bm, %mm6 + psllq $32, %mm4 + psllq $16, %mm5 + por %mm6, %mm4 + por %mm5, %mm4 + + pcmpeqw %mm6, %mm6 + psllw $15, %mm6 /* 80 00 80 00 80 00 80 00 */ + movq %mm6, %mm5 + pmulhw %mm4, %mm5 /* Get correction factor */ +1: + movl %ebx, %ecx +2: + movd (%esi, %ecx, 4), %mm1 /* 00 rr gg bb */ + pxor %mm0, %mm0 + punpcklbw %mm1, %mm0 /* 00 00 rr 00 gg 00 bb 00 */ + pxor %mm6, %mm0 /* Flip sign */ + + pmulhw %mm4, %mm0 /* 00 00 xx rr xx gg xx bb */ + psubw %mm5, %mm0 /* Correct range */ + packuswb %mm0, %mm0 /* 00 rr gg bb 00 rr gg bb */ + + movd %mm0, (%esi, %ecx, 4) + + incl %ecx + jnz 2b + + addl bpl, %esi + decl %edx + jnz 1b +3: + LEAVE + + +HAVE_MMX: + push %ebx +/* Check if bit 21 in flags word is writeable */ + pushfl + popl %eax + movl %eax,%ebx + xorl $0x00200000, %eax + pushl %eax + popfl + pushfl + popl %eax + + cmpl %eax, %ebx + je 8f + +/* OK, we have CPUID */ + + movl $1, %eax + cpuid + + test $0x00800000, %edx + jz 8f + + movl $1, %eax /* success, have mmx */ + popl %ebx + ret + +8: + xorl %eax,%eax /* failed, no mmx */ + popl %ebx + ret + +#if defined(__GNUC__) && !defined(_WIN32) +.section .note.GNU-stack, "", @progbits +.previous +#endif |