diff decoders/libmpg123/decode_sse3d.h @ 562:7e08477b0fc1

MP3 decoder upgrade work. Ripped out SMPEG and mpglib support, replaced it with "mpg123.c" and libmpg123. libmpg123 is a much better version of mpglib, so it should solve all the problems about MP3s not seeking, or most modern MP3s not playing at all, etc. Since you no longer have to make a tradeoff with SMPEG for features, and SMPEG is basically rotting, I removed it from the project. There is still work to be done with libmpg123...there are MMX, 3DNow, SSE, Altivec, etc decoders which we don't have enabled at the moment, and the build system could use some work to make this compile more cleanly, etc. Still: huge win.
author Ryan C. Gordon <icculus@icculus.org>
date Fri, 30 Jan 2009 02:44:47 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/decoders/libmpg123/decode_sse3d.h	Fri Jan 30 02:44:47 2009 -0500
@@ -0,0 +1,247 @@
+/*
+	decode_sse3d: Synth for SSE and extended 3DNow (yeah, the name is a relic)
+
+	copyright 2006-2007 by Zuxy Meng/the mpg123 project - free software under the terms of the LGPL 2.1
+	see COPYING and AUTHORS files in distribution or http://mpg123.org
+	initially written by the mysterious higway for MMX (apparently)
+	then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
+	Both have agreed to distribution under LGPL 2.1 .
+
+	Transformed back into standalone asm, with help of
+	gcc -S -DHAVE_CONFIG_H -I.  -march=pentium -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o decode_mmxsse.{S,c}
+
+	The difference between SSE and 3DNowExt is the dct64 function and the synth function name.
+	This template here uses the SYNTH_NAME and MPL_DCT64 macros for this - see decode_sse.S and decode_3dnowext.S...
+	That's not memory efficient since there's doubled code, but it's easier than giving another function pointer.
+	Maybe I'll change it in future, but now I need something that works.
+
+	Original comment from MPlayer source follows:
+*/
+
+/*
+ * this code comes under GPL
+ * This code was taken from http://www.mpg123.org
+ * See ChangeLog of mpg123-0.59s-pre.1 for detail
+ * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
+ *
+ * Local ChangeLog:
+ * - Partial loops unrolling and removing MOVW insn from loops
+*/
+
+#include "mangle.h"
+
+	.data
+	ALIGN8
+one_null:
+	.long	-65536 /* 0xFFFF0000 per dword: keeps the HIGH 16-bit word of each dword */
+	.long	-65536 /* (the neighbouring channel's sample already in the output) */
+	ALIGN8
+null_one:
+	.long	65535 /* 0x0000FFFF per dword: keeps the LOW 16-bit word of each dword */
+	.long	65535 /* (the freshly computed sample; see the pand/por merge below) */
+
+	.text
+	ALIGN16,,15
+	/* void SYNTH_NAME(real *bandPtr, int channel, short *samples, short *buffs, int *bo, float *decwins) */ /* NOTE(review): decwins is stepped in 2-byte units and fed to pmaddwd, i.e. used here as 16-bit fixed-point words, not floats -- confirm against the caller */
+.globl SYNTH_NAME
+SYNTH_NAME:
+	pushl	%ebp
+/* stack:0=ebp 4=back 8=bandptr 12=channel 16=samples 20=buffs 24=bo 28=decwins */
+	movl	%esp, %ebp
+/* Now the old stack addresses are preserved via %epb. */
+	subl  $4,%esp /* What has been called temp before. */
+	pushl	%edi
+	pushl	%esi
+	pushl	%ebx
+#define TEMP 12(%esp)
+#APP
+	movl 12(%ebp),%ecx /* ecx = channel (0 or 1) */
+	movl 16(%ebp),%edi /* edi = samples */
+	movl $15,%ebx /* ebx = 15: mask for the 16-slot ring of buffer offsets */
+	movl 24(%ebp),%edx /* edx = bo (persistent ring offset, passed by pointer) */
+	leal (%edi,%ecx,2),%edi /* edi -> this channel's word in the interleaved output */
+	decl %ecx
+	movl 20(%ebp),%esi /* esi = buffs */
+	movl (%edx),%eax /* eax = *bo */
+	jecxz .L01 /* channel 1 reuses the offset; channel 0 falls through and advances it */
+	decl %eax
+	andl %ebx,%eax /* *bo = (*bo - 1) & 15 */
+	leal 1088(%esi),%esi /* channel 0 works in the second 1088-byte half of buffs */
+	movl %eax,(%edx) /* write the advanced offset back */
+	.L01:
+	leal (%esi,%eax,2),%edx /* edx = buffs half + 2*offset: one dct64 destination */
+	movl %eax,TEMP /* keep the offset for the window fixup below */
+	incl %eax
+	andl %ebx,%eax /* neighbouring ring slot, modulo 16 */
+	leal 544(%esi,%eax,2),%ecx /* ecx = partner destination 544 bytes further on */
+	incl %ebx /* ebx = 16 */
+	testl $1, %eax
+	jnz .L02
+	xchgl %edx,%ecx /* even slot: swap the two dct64 destinations... */
+	incl TEMP
+	leal 544(%esi),%esi /* ...and shift the read base to the second column */
+	.L02:
+	emms /* leave MMX state clean before calling out */
+	pushl 8(%ebp)
+	pushl %edx
+	pushl %ecx
+	call MPL_DCT64 /* cdecl MPL_DCT64(ecx, edx, bandPtr): 64-point DCT into both buffers */
+	addl $12, %esp
+	leal 1(%ebx), %ecx /* ecx = 17 window steps total (ebx preserved across the call) */
+	subl TEMP,%ebx
+	pushl %ecx
+	/* leal ASM_NAME(decwins)(%ebx,%ebx,1), %edx */
+	movl 28(%ebp),%ecx
+	leal (%ecx,%ebx,2), %edx /* edx = decwins + 2*(16 - saved offset): first window line */
+	movl (%esp),%ecx /* restore, but leave value on stack */
+	shrl $1, %ecx /* pair up the window steps: 8 double iterations in .L03 */
+	ALIGN16
+	.L03:
+	movq  (%edx),%mm0 /* two window lines per pass: mm0/mm4 accumulate... */
+	movq  64(%edx),%mm4
+	pmaddwd (%esi),%mm0 /* ...sum(win[i]*buf[i]) as 32-bit pairs (16x16 MACs) */
+	pmaddwd 32(%esi),%mm4
+	movq  8(%edx),%mm1
+	movq  72(%edx),%mm5
+	pmaddwd 8(%esi),%mm1
+	pmaddwd 40(%esi),%mm5
+	movq  16(%edx),%mm2
+	movq  80(%edx),%mm6
+	pmaddwd 16(%esi),%mm2
+	pmaddwd 48(%esi),%mm6
+	movq  24(%edx),%mm3
+	movq  88(%edx),%mm7
+	pmaddwd 24(%esi),%mm3
+	pmaddwd 56(%esi),%mm7
+	paddd %mm1,%mm0
+	paddd %mm5,%mm4
+	paddd %mm2,%mm0
+	paddd %mm6,%mm4
+	paddd %mm3,%mm0
+	paddd %mm7,%mm4
+	movq  %mm0,%mm1 /* fold the high and low 32-bit partial sums together */
+	movq  %mm4,%mm5
+	psrlq $32,%mm1
+	psrlq $32,%mm5
+	paddd %mm1,%mm0
+	paddd %mm5,%mm4
+	psrad $13,%mm0 /* >>13: drops the window's fixed-point fraction (presumably 13 frac bits) */
+	psrad $13,%mm4
+	packssdw %mm0,%mm0 /* saturate down to 16-bit PCM */
+	packssdw %mm4,%mm4
+	movq	(%edi), %mm1
+	punpckldq %mm4, %mm0 /* both new samples side by side in mm0 */
+	pand   one_null, %mm1 /* keep the neighbouring (other-channel) words... */
+	pand   null_one, %mm0 /* ...and insert ours in the low word of each dword */
+	por    %mm0, %mm1
+	movq   %mm1,(%edi)
+	leal 64(%esi),%esi /* advance buffer, window and output pointers */
+	leal 128(%edx),%edx
+	leal 8(%edi),%edi
+	decl %ecx
+	jnz  .L03
+	popl %ecx /* recover the full step count (17) */
+	andl $1, %ecx
+	jecxz .next_loop /* odd count: one single-window sample remains */
+	movq  (%edx),%mm0
+	pmaddwd (%esi),%mm0
+	movq  8(%edx),%mm1
+	pmaddwd 8(%esi),%mm1
+	movq  16(%edx),%mm2
+	pmaddwd 16(%esi),%mm2
+	movq  24(%edx),%mm3
+	pmaddwd 24(%esi),%mm3
+	paddd %mm1,%mm0
+	paddd %mm2,%mm0
+	paddd %mm3,%mm0
+	movq  %mm0,%mm1
+	psrlq $32,%mm1
+	paddd %mm1,%mm0
+	psrad $13,%mm0
+	packssdw %mm0,%mm0
+	movd %mm0,%eax
+	movw %ax, (%edi) /* single 16-bit sample for this channel */
+	leal 32(%esi),%esi
+	leal 64(%edx),%edx
+	leal 4(%edi),%edi
+	.next_loop:
+	subl $64,%esi /* second half: walk the buffer backwards... */
+	movl $7,%ecx /* ...for 7 double steps, negating each windowed sum */
+	ALIGN16
+	.L04:
+	movq  (%edx),%mm0
+	movq  64(%edx),%mm4
+	pmaddwd (%esi),%mm0
+	pmaddwd -32(%esi),%mm4 /* mirrored access: esi now runs downwards */
+	movq  8(%edx),%mm1
+	movq  72(%edx),%mm5
+	pmaddwd 8(%esi),%mm1
+	pmaddwd -24(%esi),%mm5
+	movq  16(%edx),%mm2
+	movq  80(%edx),%mm6
+	pmaddwd 16(%esi),%mm2
+	pmaddwd -16(%esi),%mm6
+	movq  24(%edx),%mm3
+	movq  88(%edx),%mm7
+	pmaddwd 24(%esi),%mm3
+	pmaddwd -8(%esi),%mm7
+	paddd %mm1,%mm0
+	paddd %mm5,%mm4
+	paddd %mm2,%mm0
+	paddd %mm6,%mm4
+	paddd %mm3,%mm0
+	paddd %mm7,%mm4
+	movq  %mm0,%mm1
+	movq  %mm4,%mm5
+	psrlq $32,%mm1
+	psrlq $32,%mm5
+	paddd %mm0,%mm1
+	paddd %mm4,%mm5
+	psrad $13,%mm1
+	psrad $13,%mm5
+	packssdw %mm1,%mm1
+	packssdw %mm5,%mm5
+	psubd %mm0,%mm0 /* mm0/mm4 = 0... */
+	psubd %mm4,%mm4
+	psubsw %mm1,%mm0 /* ...then 0 - sample: negate with signed saturation */
+	psubsw %mm5,%mm4
+	movq	(%edi), %mm1
+	punpckldq %mm4, %mm0
+	pand   one_null, %mm1 /* merge into the interleaved output exactly as in .L03 */
+	pand   null_one, %mm0
+	por    %mm0, %mm1
+	movq   %mm1,(%edi)
+	subl $64,%esi /* buffer pointer keeps stepping backwards */
+	addl $128,%edx
+	leal 8(%edi),%edi
+	decl %ecx
+	jnz  .L04
+	movq  (%edx),%mm0 /* final (negated) single-window sample */
+	pmaddwd (%esi),%mm0
+	movq  8(%edx),%mm1
+	pmaddwd 8(%esi),%mm1
+	movq  16(%edx),%mm2
+	pmaddwd 16(%esi),%mm2
+	movq  24(%edx),%mm3
+	pmaddwd 24(%esi),%mm3
+	paddd %mm1,%mm0
+	paddd %mm2,%mm0
+	paddd %mm3,%mm0
+	movq  %mm0,%mm1
+	psrlq $32,%mm1
+	paddd %mm0,%mm1
+	psrad $13,%mm1
+	packssdw %mm1,%mm1
+	psubd %mm0,%mm0
+	psubsw %mm1,%mm0 /* negate, as in .L04 */
+	movd %mm0,%eax
+	movw %ax,(%edi)
+	emms /* clear MMX state before returning to FPU-using callers */
+	
+#NO_APP
+	popl	%ebx
+	popl	%esi
+	popl	%edi
+	addl $4,%esp /* drop the TEMP slot */
+	popl	%ebp
+	ret