SDL_sound_CoreAudio: diff decoders/libmpg123/dct64_sse.S @ 562:7e08477b0fc1
MP3 decoder upgrade work.
Ripped out SMPEG and mpglib support and replaced them with "mpg123.c" and
libmpg123. libmpg123 is a much better version of mpglib, so it should solve
all the problems with MP3s not seeking, most modern MP3s not playing at all,
etc. Since you no longer have to trade away features by choosing SMPEG, and
SMPEG is basically rotting, I removed it from the project.
There is still work to be done with libmpg123: there are MMX, 3DNow!, SSE,
AltiVec, etc. decoders that we don't have enabled at the moment, and the
build system could use some work to make this compile more cleanly.
Still: huge win.
| author   | Ryan C. Gordon <icculus@icculus.org> |
|----------|--------------------------------------|
| date     | Fri, 30 Jan 2009 02:44:47 -0500      |
| parents  |                                      |
| children |                                      |
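For context on what replaced the old mpglib glue: the new "mpg123.c" decoder drives libmpg123 through its feed-style API, pushing compressed bytes in and pulling decoded PCM out. Below is a minimal, hedged sketch of that loop; the `mpg123_*` calls are the library's real entry points, but the buffer sizes, the `decode_stream` name, and the stdio source are illustrative assumptions, not code from this changeset.

```c
#include <stdio.h>
#include <mpg123.h>

/* Hedged sketch of libmpg123's feed API (not code from this commit):
 * push compressed MP3 bytes in with mpg123_feed(), pull PCM out with
 * mpg123_read() until the decoder asks for more input. */
static int decode_stream(FILE *in)
{
    unsigned char inbuf[4096], pcm[8192];
    size_t got, done;
    int err;

    mpg123_init();                        /* one-time library setup */
    mpg123_handle *mh = mpg123_new(NULL, &err);
    if (mh == NULL)
        return -1;
    mpg123_open_feed(mh);                 /* caller supplies the bitstream */

    while ((got = fread(inbuf, 1, sizeof (inbuf), in)) > 0)
    {
        mpg123_feed(mh, inbuf, got);      /* hand over compressed bytes */
        do
        {
            err = mpg123_read(mh, pcm, sizeof (pcm), &done);
            /* ... deliver 'done' bytes of PCM from 'pcm' downstream ... */
        } while (err == MPG123_OK || err == MPG123_NEW_FORMAT);
        if (err != MPG123_NEED_MORE)
            break;                        /* real error or end of stream */
    }

    mpg123_delete(mh);
    mpg123_exit();
    return 0;
}
```

This push/pull split is presumably what makes the decoder a good fit for SDL_sound's buffered-read model: the library never touches the data source itself, so seeking and streaming stay under the caller's control.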
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/decoders/libmpg123/dct64_sse.S	Fri Jan 30 02:44:47 2009 -0500
@@ -0,0 +1,567 @@
+/*
+	dct64_sse: MMX/SSE optimized dct64
+
+	copyright 2006-2007 by Zuxy Meng <zuxy.meng@gmail.com> / the mpg123 project - free software under the terms of the LGPL 2.1
+	see COPYING and AUTHORS files in distribution or http://mpg123.org
+	initially written by the mysterious higway for MMX (apparently)
+	then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
+	Both have agreed to distribution under LGPL 2.1 .
+
+	Transformed back into standalone asm, with help of
+	gcc -S -DHAVE_CONFIG_H -I. -march=pentium3 -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o dct64_sse.{S,c}
+
+	Original comment from MPlayer source follows:
+*/
+
+/*
+ * Discrete Cosine Tansform (DCT) for SSE
+ * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c
+ * and mp3lib/dct64_MMX.c
+ */
+
+#include "mangle.h"
+
+#ifndef __APPLE__
+	.section	.rodata
+#else
+	.data
+#endif
+	ALIGN16
+	/* .type	nnnn, @object
+	   .size	nnnn, 16 */
+nnnn:
+	.long	-2147483648
+	.long	-2147483648
+	.long	-2147483648
+	.long	-2147483648
+	ALIGN16
+	/* .type	ppnn, @object
+	   .size	ppnn, 16 */
+ppnn:
+	.long	0
+	.long	0
+	.long	-2147483648
+	.long	-2147483648
+	ALIGN16
+	/* .type	pnpn, @object
+	   .size	pnpn, 16 */
+pnpn:
+	.long	0
+	.long	-2147483648
+	.long	0
+	.long	-2147483648
+	ALIGN4
+	/* .type	one.4748, @object
+	   .size	one.4748, 4 */
+one.4748:
+	.long	1065353216
+
+	.text
+	ALIGN16,,15
+.globl ASM_NAME(dct64_sse)
+	/* .type	ASM_NAME(dct64_sse), @function */
+ASM_NAME(dct64_sse):
+	pushl	%ebp
+	movl	%esp, %ebp
+	/* stack from ebp: 0=ebp 4=back 8=arg0 12=arg1 16=arg2 */
+#define ARG(n) (8+n*4)(%ebp)
+	andl	$-16, %esp	/* align the stack at 16 bytes */
+	subl	$256, %esp	/* reserve space for local b1 and b2 */
+	pushl	%ebx
+/* stack from esp: 0=ebx 4...131=b2 132...259=b1 */
+#define B1OFF 132
+#define B2OFF 4
+#define B1(n) (B1OFF+n)(%esp)
+#define B2(n) (B2OFF+n)(%esp)
+
+	movl	ARG(2), %eax
+	movl	ARG(0), %ecx
+/* APP */
+/* for (i = 0; i < 0x20 / 2; i += 4) cycle 1 */
+	movaps	ASM_NAME(costab_mmxsse), %xmm3
+	shufps	$27, %xmm3, %xmm3
+	MOVUAPS	(%eax), %xmm1
+	movaps	%xmm1, %xmm4
+	MOVUAPS	112(%eax), %xmm2
+	shufps	$27, %xmm4, %xmm4
+	movaps	%xmm2, %xmm0
+	shufps	$27, %xmm0, %xmm0
+	addps	%xmm0, %xmm1
+	movaps	%xmm1, B1(0)
+	subps	%xmm2, %xmm4
+	mulps	%xmm3, %xmm4
+	movaps	%xmm4, B1(112)
+
+/* NO_APP */
+	movl	ARG(1), %ebx
+/* APP */
+/* for (i = 0; i < 0x20 / 2; i += 4) cycle 2 */
+	movaps	ASM_NAME(costab_mmxsse)+16, %xmm3
+	shufps	$27, %xmm3, %xmm3
+	MOVUAPS	16(%eax), %xmm1
+	movaps	%xmm1, %xmm4
+	MOVUAPS	96(%eax), %xmm2
+	shufps	$27, %xmm4, %xmm4
+	movaps	%xmm2, %xmm0
+	shufps	$27, %xmm0, %xmm0
+	addps	%xmm0, %xmm1
+	movaps	%xmm1, B1(16)
+	subps	%xmm2, %xmm4
+	mulps	%xmm3, %xmm4
+	movaps	%xmm4, B1(96)
+
+/* for (i = 0; i < 0x20 / 2; i += 4) cycle 3 */
+	movaps	ASM_NAME(costab_mmxsse)+32, %xmm3
+	shufps	$27, %xmm3, %xmm3
+	MOVUAPS	32(%eax), %xmm1
+	movaps	%xmm1, %xmm4
+	MOVUAPS	80(%eax), %xmm2
+	shufps	$27, %xmm4, %xmm4
+	movaps	%xmm2, %xmm0
+	shufps	$27, %xmm0, %xmm0
+	addps	%xmm0, %xmm1
+	movaps	%xmm1, B1(32)
+	subps	%xmm2, %xmm4
+	mulps	%xmm3, %xmm4
+	movaps	%xmm4, B1(80)
+
+/* for (i = 0; i < 0x20 / 2; i += 4) cycle 4 */
+	movaps	ASM_NAME(costab_mmxsse)+48, %xmm3
+	shufps	$27, %xmm3, %xmm3
+	MOVUAPS	48(%eax), %xmm1
+	movaps	%xmm1, %xmm4
+	MOVUAPS	64(%eax), %xmm2
+	shufps	$27, %xmm4, %xmm4
+	movaps	%xmm2, %xmm0
+	shufps	$27, %xmm0, %xmm0
+	addps	%xmm0, %xmm1
+	movaps	%xmm1, B1(48)
+	subps	%xmm2, %xmm4
+	mulps	%xmm3, %xmm4
+	movaps	%xmm4, B1(64)
+
+	movaps	B1(0), %xmm1
+	movaps	B1(16), %xmm3
+	movaps	B1(32), %xmm4
+	movaps	B1(48), %xmm6
+	movaps	%xmm1, %xmm7
+	shufps	$27, %xmm7, %xmm7
+	movaps	%xmm3, %xmm5
+	shufps	$27, %xmm5, %xmm5
+	movaps	%xmm4, %xmm2
+	shufps	$27, %xmm2, %xmm2
+	movaps	%xmm6, %xmm0
+	shufps	$27, %xmm0, %xmm0
+	addps	%xmm0, %xmm1
+	movaps	%xmm1, B2(0)
+	addps	%xmm2, %xmm3
+	movaps	%xmm3, B2(16)
+	subps	%xmm4, %xmm5
+	movaps	%xmm5, B2(32)
+	subps	%xmm6, %xmm7
+	movaps	%xmm7, B2(48)
+
+	movaps	B1(64), %xmm1
+	movaps	B1(80), %xmm3
+	movaps	B1(96), %xmm4
+	movaps	B1(112), %xmm6
+	movaps	%xmm1, %xmm7
+	shufps	$27, %xmm7, %xmm7
+	movaps	%xmm3, %xmm5
+	shufps	$27, %xmm5, %xmm5
+	movaps	%xmm4, %xmm2
+	shufps	$27, %xmm2, %xmm2
+	movaps	%xmm6, %xmm0
+	shufps	$27, %xmm0, %xmm0
+	addps	%xmm0, %xmm1
+	movaps	%xmm1, B2(64)
+	addps	%xmm2, %xmm3
+	movaps	%xmm3, B2(80)
+	subps	%xmm4, %xmm5
+	movaps	%xmm5, B2(96)
+	subps	%xmm6, %xmm7
+	movaps	%xmm7, B2(112)
+
+	movaps	B2(32), %xmm0
+	movaps	B2(48), %xmm1
+	movaps	ASM_NAME(costab_mmxsse)+64, %xmm4
+	xorps	%xmm6, %xmm6
+	shufps	$27, %xmm4, %xmm4
+	mulps	%xmm4, %xmm1
+	movaps	ASM_NAME(costab_mmxsse)+80, %xmm2
+	xorps	%xmm7, %xmm7
+	shufps	$27, %xmm2, %xmm2
+	mulps	%xmm2, %xmm0
+	movaps	%xmm0, B2(32)
+	movaps	%xmm1, B2(48)
+	movaps	B2(96), %xmm3
+	mulps	%xmm2, %xmm3
+	subps	%xmm3, %xmm6
+	movaps	%xmm6, B2(96)
+	movaps	B2(112), %xmm5
+	mulps	%xmm4, %xmm5
+	subps	%xmm5, %xmm7
+	movaps	%xmm7, B2(112)
+
+	movaps	ASM_NAME(costab_mmxsse)+96, %xmm0
+	shufps	$27, %xmm0, %xmm0
+	movaps	nnnn, %xmm5
+	movaps	%xmm5, %xmm6
+
+	movaps	B2(0), %xmm2
+	movaps	B2(16), %xmm3
+	movaps	%xmm2, %xmm4
+	xorps	%xmm5, %xmm6
+	shufps	$27, %xmm4, %xmm4
+	movaps	%xmm3, %xmm1
+	shufps	$27, %xmm1, %xmm1
+	addps	%xmm1, %xmm2
+	movaps	%xmm2, B1(0)
+	subps	%xmm3, %xmm4
+	xorps	%xmm6, %xmm4
+	mulps	%xmm0, %xmm4
+	movaps	%xmm4, B1(16)
+
+	movaps	B2(32), %xmm2
+	movaps	B2(48), %xmm3
+	movaps	%xmm2, %xmm4
+	xorps	%xmm5, %xmm6
+	shufps	$27, %xmm4, %xmm4
+	movaps	%xmm3, %xmm1
+	shufps	$27, %xmm1, %xmm1
+	addps	%xmm1, %xmm2
+	movaps	%xmm2, B1(32)
+	subps	%xmm3, %xmm4
+	xorps	%xmm6, %xmm4
+	mulps	%xmm0, %xmm4
+	movaps	%xmm4, B1(48)
+
+	movaps	B2(64), %xmm2
+	movaps	B2(80), %xmm3
+	movaps	%xmm2, %xmm4
+	xorps	%xmm5, %xmm6
+	shufps	$27, %xmm4, %xmm4
+	movaps	%xmm3, %xmm1
+	shufps	$27, %xmm1, %xmm1
+	addps	%xmm1, %xmm2
+	movaps	%xmm2, B1(64)
+	subps	%xmm3, %xmm4
+	xorps	%xmm6, %xmm4
+	mulps	%xmm0, %xmm4
+	movaps	%xmm4, B1(80)
+
+	movaps	B2(96), %xmm2
+	movaps	B2(112), %xmm3
+	movaps	%xmm2, %xmm4
+	xorps	%xmm5, %xmm6
+	shufps	$27, %xmm4, %xmm4
+	movaps	%xmm3, %xmm1
+	shufps	$27, %xmm1, %xmm1
+	addps	%xmm1, %xmm2
+	movaps	%xmm2, B1(96)
+	subps	%xmm3, %xmm4
+	xorps	%xmm6, %xmm4
+	mulps	%xmm0, %xmm4
+	movaps	%xmm4, B1(112)
+
+	movss	one.4748, %xmm1
+	movss	ASM_NAME(costab_mmxsse)+112, %xmm0
+	movaps	%xmm1, %xmm3
+	unpcklps	%xmm0, %xmm3
+	movss	ASM_NAME(costab_mmxsse)+116, %xmm2
+	movaps	%xmm1, %xmm0
+	unpcklps	%xmm2, %xmm0
+	unpcklps	%xmm3, %xmm0
+	movaps	ppnn, %xmm2
+
+	movaps	B1(0), %xmm3
+	movaps	%xmm3, %xmm4
+	shufps	$20, %xmm4, %xmm4
+	shufps	$235, %xmm3, %xmm3
+	xorps	%xmm2, %xmm3
+	addps	%xmm3, %xmm4
+	mulps	%xmm0, %xmm4
+	movaps	%xmm4, B2(0)
+	movaps	B1(16), %xmm6
+	movaps	%xmm6, %xmm5
+	shufps	$27, %xmm5, %xmm5
+	xorps	%xmm2, %xmm5
+	addps	%xmm5, %xmm6
+	mulps	%xmm0, %xmm6
+	movaps	%xmm6, B2(16)
+
+	movaps	B1(32), %xmm3
+	movaps	%xmm3, %xmm4
+	shufps	$20, %xmm4, %xmm4
+	shufps	$235, %xmm3, %xmm3
+	xorps	%xmm2, %xmm3
+	addps	%xmm3, %xmm4
+	mulps	%xmm0, %xmm4
+	movaps	%xmm4, B2(32)
+	movaps	B1(48), %xmm6
+	movaps	%xmm6, %xmm5
+	shufps	$27, %xmm5, %xmm5
+	xorps	%xmm2, %xmm5
+	addps	%xmm5, %xmm6
+	mulps	%xmm0, %xmm6
+	movaps	%xmm6, B2(48)
+
+	movaps	B1(64), %xmm3
+	movaps	%xmm3, %xmm4
+	shufps	$20, %xmm4, %xmm4
+	shufps	$235, %xmm3, %xmm3
+	xorps	%xmm2, %xmm3
+	addps	%xmm3, %xmm4
+	mulps	%xmm0, %xmm4
+	movaps	%xmm4, B2(64)
+	movaps	B1(80), %xmm6
+	movaps	%xmm6, %xmm5
+	shufps	$27, %xmm5, %xmm5
+	xorps	%xmm2, %xmm5
+	addps	%xmm5, %xmm6
+	mulps	%xmm0, %xmm6
+	movaps	%xmm6, B2(80)
+
+	movaps	B1(96), %xmm3
+	movaps	%xmm3, %xmm4
+	shufps	$20, %xmm4, %xmm4
+	shufps	$235, %xmm3, %xmm3
+	xorps	%xmm2, %xmm3
+	addps	%xmm3, %xmm4
+	mulps	%xmm0, %xmm4
+	movaps	%xmm4, B2(96)
+	movaps	B1(112), %xmm6
+	movaps	%xmm6, %xmm5
+	shufps	$27, %xmm5, %xmm5
+	xorps	%xmm2, %xmm5
+	addps	%xmm5, %xmm6
+	mulps	%xmm0, %xmm6
+	movaps	%xmm6, B2(112)
+
+	movss	ASM_NAME(costab_mmxsse)+120, %xmm0
+	movaps	%xmm1, %xmm2
+	movaps	%xmm0, %xmm7
+	unpcklps	%xmm1, %xmm2
+	unpcklps	%xmm0, %xmm7
+	movaps	pnpn, %xmm0
+	unpcklps	%xmm7, %xmm2
+
+	movaps	B2(32), %xmm1
+	movaps	%xmm1, %xmm3
+	shufps	$224, %xmm3, %xmm3
+	shufps	$181, %xmm1, %xmm1
+	xorps	%xmm0, %xmm1
+	addps	%xmm1, %xmm3
+	mulps	%xmm2, %xmm3
+	movaps	%xmm3, B1(32)
+	movaps	B2(48), %xmm4
+	movaps	%xmm4, %xmm5
+	shufps	$224, %xmm5, %xmm5
+	shufps	$181, %xmm4, %xmm4
+	xorps	%xmm0, %xmm4
+	addps	%xmm4, %xmm5
+	mulps	%xmm2, %xmm5
+	movaps	%xmm5, B1(48)
+
+	movaps	B2(64), %xmm1
+	movaps	%xmm1, %xmm3
+	shufps	$224, %xmm3, %xmm3
+	shufps	$181, %xmm1, %xmm1
+	xorps	%xmm0, %xmm1
+	addps	%xmm1, %xmm3
+	mulps	%xmm2, %xmm3
+	movaps	%xmm3, B1(64)
+	movaps	B2(80), %xmm4
+	movaps	%xmm4, %xmm5
+	shufps	$224, %xmm5, %xmm5
+	shufps	$181, %xmm4, %xmm4
+	xorps	%xmm0, %xmm4
+	addps	%xmm4, %xmm5
+	mulps	%xmm2, %xmm5
+	movaps	%xmm5, B1(80)
+
+	movaps	B2(96), %xmm1
+	movaps	%xmm1, %xmm3
+	shufps	$224, %xmm3, %xmm3
+	shufps	$181, %xmm1, %xmm1
+	xorps	%xmm0, %xmm1
+	addps	%xmm1, %xmm3
+	mulps	%xmm2, %xmm3
+	movaps	%xmm3, B1(96)
+	movaps	B2(112), %xmm4
+	movaps	%xmm4, %xmm5
+	shufps	$224, %xmm5, %xmm5
+	shufps	$181, %xmm4, %xmm4
+	xorps	%xmm0, %xmm4
+	addps	%xmm4, %xmm5
+	mulps	%xmm2, %xmm5
+	movaps	%xmm5, B1(112)
+
+/* NO_APP */
+	flds	B1(40)
+	movl	%esp, %edx
+	addl	$B1OFF, %edx
+	movl	%esp, %eax
+	addl	$B2OFF, %eax
+	fadds	B1(44)
+	fstps	B1(40)
+	flds	B1(56)
+	fadds	B1(60)
+	flds	B1(48)
+	fadd	%st(1), %st
+	fstps	B1(48)
+	fadds	B1(52)
+	fstps	B1(56)
+	flds	B1(52)
+	fadds	B1(60)
+	fstps	B1(52)
+	flds	B1(72)
+	fadds	B1(76)
+	fstps	B1(72)
+	flds	B1(88)
+	fadds	B1(92)
+	flds	B1(80)
+	fadd	%st(1), %st
+	fstps	B1(80)
+	fadds	B1(84)
+	fstps	B1(88)
+	flds	B1(84)
+	fadds	B1(92)
+	fstps	B1(84)
+	flds	B1(104)
+	fadds	B1(108)
+	fstps	B1(104)
+	flds	B1(120)
+	fadds	B1(124)
+	flds	B1(112)
+	fadd	%st(1), %st
+	fstps	B1(112)
+	fadds	B1(116)
+	fstps	B1(120)
+	flds	B1(116)
+	fadds	B1(124)
+	fstps	B1(116)
+/* APP */
+	flds	ASM_NAME(costab_mmxsse)+120
+	flds	(%eax)
+	fadds	4(%eax)
+	fistp	512(%ecx)
+	flds	(%eax)
+	fsubs	4(%eax)
+	fmul	%st(1)
+	fistp	(%ecx)
+	flds	12(%eax)
+	fsubs	8(%eax)
+	fmul	%st(1)
+	fist	256(%ebx)
+	fadds	12(%eax)
+	fadds	8(%eax)
+	fistp	256(%ecx)
+	flds	16(%eax)
+	fsubs	20(%eax)
+	fmul	%st(1)
+	flds	28(%eax)
+	fsubs	24(%eax)
+	fmul	%st(2)
+	fist	384(%ebx)
+	fld	%st(0)
+	fadds	24(%eax)
+	fadds	28(%eax)
+	fld	%st(0)
+	fadds	16(%eax)
+	fadds	20(%eax)
+	fistp	384(%ecx)
+	fadd	%st(2)
+	fistp	128(%ecx)
+	faddp	%st(1)
+	fistp	128(%ebx)
+	flds	32(%edx)
+	fadds	48(%edx)
+	fistp	448(%ecx)
+	flds	48(%edx)
+	fadds	40(%edx)
+	fistp	320(%ecx)
+	flds	40(%edx)
+	fadds	56(%edx)
+	fistp	192(%ecx)
+	flds	56(%edx)
+	fadds	36(%edx)
+	fistp	64(%ecx)
+	flds	36(%edx)
+	fadds	52(%edx)
+	fistp	64(%ebx)
+	flds	52(%edx)
+	fadds	44(%edx)
+	fistp	192(%ebx)
+	flds	60(%edx)
+	fist	448(%ebx)
+	fadds	44(%edx)
+	fistp	320(%ebx)
+	flds	96(%edx)
+	fadds	112(%edx)
+	fld	%st(0)
+	fadds	64(%edx)
+	fistp	480(%ecx)
+	fadds	80(%edx)
+	fistp	416(%ecx)
+	flds	112(%edx)
+	fadds	104(%edx)
+	fld	%st(0)
+	fadds	80(%edx)
+	fistp	352(%ecx)
+	fadds	72(%edx)
+	fistp	288(%ecx)
+	flds	104(%edx)
+	fadds	120(%edx)
+	fld	%st(0)
+	fadds	72(%edx)
+	fistp	224(%ecx)
+	fadds	88(%edx)
+	fistp	160(%ecx)
+	flds	120(%edx)
+	fadds	100(%edx)
+	fld	%st(0)
+	fadds	88(%edx)
+	fistp	96(%ecx)
+	fadds	68(%edx)
+	fistp	32(%ecx)
+	flds	100(%edx)
+	fadds	116(%edx)
+	fld	%st(0)
+	fadds	68(%edx)
+	fistp	32(%ebx)
+	fadds	84(%edx)
+	fistp	96(%ebx)
+	flds	116(%edx)
+	fadds	108(%edx)
+	fld	%st(0)
+	fadds	84(%edx)
+	fistp	160(%ebx)
+	fadds	76(%edx)
+	fistp	224(%ebx)
+	flds	108(%edx)
+	fadds	124(%edx)
+	fld	%st(0)
+	fadds	76(%edx)
+	fistp	288(%ebx)
+	fadds	92(%edx)
+	fistp	352(%ebx)
+	flds	124(%edx)
+	fist	480(%ebx)
+	fadds	92(%edx)
+	fistp	416(%ebx)
+	ffreep	%st(0)
+
+/* NO_APP */
+	movzwl	(%ecx), %eax
+	movw	%ax, (%ebx)
+	popl	%ebx
+	movl	%ebp, %esp
+	popl	%ebp
+	ret
+	/* .size	ASM_NAME(dct64_sse), .-ASM_NAME(dct64_sse) */
+
+/* Mark non-executable stack. */
+#if defined(__linux__) && defined(__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
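A reading aid, not part of the changeset: this routine is the SSE port of mpg123's DCT64 used by the polyphase synthesis filter. Judging from the argument loads at the top (ARG(0) into %ecx, ARG(1) into %ebx, ARG(2) into %eax), the fist/fistp integer stores into the two output buffers, and the closing movzwl/movw fixup, its C-level shape is approximately the prototype below. The parameter names are guesses inferred from the register usage, and the float type follows from the -DREAL_IS_FLOAT flag quoted in the file header; the real declaration lives in libmpg123's own sources.

```c
/* Hypothetical prototype inferred from the register usage above:
 * two 16-bit output windows (written at strided offsets, hence
 * fistp 512(%ecx) and friends) and 32 float input samples. */
void dct64_sse(short *out0 /* %ecx */, short *out1 /* %ebx */, float *samples /* %eax */);
```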