comparison decoders/libmpg123/decode_sse3d.h @ 562:7e08477b0fc1

MP3 decoder upgrade work. Ripped out SMPEG and mpglib support, replaced it with "mpg123.c" and libmpg123. libmpg123 is a much better version of mpglib, so it should solve all the problems about MP3s not seeking, or most modern MP3s not playing at all, etc. Since you no longer have to make a tradeoff with SMPEG for features, and SMPEG is basically rotting, I removed it from the project. There is still work to be done with libmpg123...there are MMX, 3DNow, SSE, Altivec, etc. decoders which we don't have enabled at the moment, and the build system could use some work to make this compile more cleanly, etc. Still: huge win.
author Ryan C. Gordon <icculus@icculus.org>
date Fri, 30 Jan 2009 02:44:47 -0500
parents
children
comparison
equal deleted inserted replaced
561:f2985e08589c 562:7e08477b0fc1
1 /*
2 decode_sse3d: Synth for SSE and extended 3DNow (yeah, the name is a relic)
3
4 copyright 2006-2007 by Zuxy Meng/the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by the mysterious higway for MMX (apparently)
7 then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
8 Both have agreed to distribution under LGPL 2.1 .
9
10 Transformed back into standalone asm, with help of
11 gcc -S -DHAVE_CONFIG_H -I. -march=pentium -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o decode_mmxsse.{S,c}
12
13 The difference between SSE and 3DNowExt is the dct64 function and the synth function name.
14 This template here uses the SYNTH_NAME and MPL_DCT64 macros for this - see decode_sse.S and decode_3dnowext.S...
15 That's not memory efficient since there's doubled code, but it's easier than giving another function pointer.
16 Maybe I'll change it in future, but now I need something that works.
17
18 Original comment from MPlayer source follows:
19 */
20
21 /*
22 * this code comes under GPL
23 * This code was taken from http://www.mpg123.org
24 * See ChangeLog of mpg123-0.59s-pre.1 for detail
25 * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
26 *
27 * Local ChangeLog:
28 * - Partial loops unrolling and removing MOVW insn from loops
29 */
30
31 #include "mangle.h"
32
33 .data
/* Two 64-bit constant masks used by SYNTH_NAME below when it merges freshly */
/* computed 16-bit samples into the output buffer. They are only ever read  */
/* (as pand source operands) in this file.                                  */
/* NOTE(review): ALIGN8 is not defined here; presumably it is an alignment  */
/* macro supplied by mangle.h - confirm.                                    */
34 ALIGN8
/* one_null: every 32-bit lane is -65536 = 0xFFFF0000, i.e. it keeps the    */
/* upper 16-bit word of each dword and clears the lower one. Applied to the */
/* quadword already present in the output buffer.                           */
35 one_null:
36 .long -65536
37 .long -65536
38 ALIGN8
/* null_one: every 32-bit lane is 65535 = 0x0000FFFF, i.e. it keeps the     */
/* lower 16-bit word of each dword and clears the upper one. Applied to the */
/* newly computed samples.                                                  */
39 null_one:
40 .long 65535
41 .long 65535
42
43 .text
44 ALIGN16,,15
45 /* void SYNTH_NAME(real *bandPtr, int channel, short *samples, short *buffs, int *bo, float *decwins) */
/*
 * Synthesis routine template (32-bit x86, AT&T syntax, MMX instructions).
 * SYNTH_NAME and MPL_DCT64 are macros that the including file must define.
 *
 * Arguments are read from the stack relative to %ebp (offsets listed below).
 *
 * What the code does, in order:
 *   1. Picks a working area inside buffs from channel and the index stored
 *      at *bo; for channel != 1 the index is decremented modulo 16 and
 *      written back to *bo.
 *   2. Calls MPL_DCT64 with two pointers into buffs plus bandPtr.
 *   3. Multiplies 16-bit words from decwins with 16-bit words from buffs
 *      (pmaddwd), sums 16 products per output sample, shifts right by 13
 *      and saturates to signed 16 bit.
 *   4. Stores the samples starting at samples + 2*channel bytes, one sample
 *      every 4 bytes. In the paired stores the other 16-bit word of each
 *      4-byte group is preserved via the one_null / null_one masks; the two
 *      single stores use a plain 16-bit write.
 *
 * Sample count: 8*2 + 1 (first half) + 7*2 + 1 (second half, negated) = 32,
 * assuming MPL_DCT64 preserves %ebx (the loop count is derived from it).
 *
 * Register roles after setup:
 *   %esi = read pointer into buffs      %edx = read pointer into decwins
 *   %edi = write pointer into samples   %ecx = loop counter
 *   %ebx = 15 (index mask), later 16, later 16 - TEMP
 * %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and the MMX
 * registers are overwritten. emms is executed before the call and before
 * returning.
 *
 * NOTE(review): the signature comment declares decwins as float*, but the
 * code reads it as packed signed 16-bit words - confirm the table layout
 * with the caller.
 * NOTE(review): one_null / null_one are addressed absolutely, so this is
 * not position-independent as written.
 */
46 .globl SYNTH_NAME
47 SYNTH_NAME:
48 pushl %ebp
49 /* stack:0=ebp 4=back 8=bandptr 12=channel 16=samples 20=buffs 24=bo 28=decwins */
50 movl %esp, %ebp
51 /* Now the old stack addresses are preserved via %ebp. */
52 subl $4,%esp /* What has been called temp before. */
53 pushl %edi
54 pushl %esi
55 pushl %ebx
/* TEMP is the 4-byte local reserved by the subl above. It sits above the  */
/* three saved registers, hence 12(%esp). The offset is only valid while   */
/* nothing extra is pushed; TEMP is not used between the later pushes.     */
56 #define TEMP 12(%esp)
57 #APP
58 movl 12(%ebp),%ecx /* ecx = channel */
59 movl 16(%ebp),%edi /* edi = samples */
60 movl $15,%ebx /* ebx = 15, used as modulo-16 mask */
61 movl 24(%ebp),%edx /* edx = bo (pointer) */
62 leal (%edi,%ecx,2),%edi /* edi = samples + 2*channel bytes */
63 decl %ecx /* ecx = channel - 1, tested by jecxz below */
64 movl 20(%ebp),%esi /* esi = buffs */
65 movl (%edx),%eax /* eax = *bo */
66 jecxz .L01 /* channel == 1: keep *bo and the buffs base as they are */
67 decl %eax
68 andl %ebx,%eax /* eax = (*bo - 1) & 15 */
69 leal 1088(%esi),%esi /* use the area 1088 bytes into buffs */
70 movl %eax,(%edx) /* write the new index back to *bo */
71 .L01:
72 leal (%esi,%eax,2),%edx /* edx = esi + 2*index */
73 movl %eax,TEMP /* remember the index */
74 incl %eax
75 andl %ebx,%eax /* eax = (index + 1) & 15 */
76 leal 544(%esi,%eax,2),%ecx /* ecx = esi + 544 + 2*((index + 1) & 15) */
77 incl %ebx /* ebx = 16 */
78 testl $1, %eax
79 jnz .L02 /* (index + 1) & 15 is odd: keep the pointers as computed */
/* Even case: swap the two MPL_DCT64 output pointers, bump the saved index */
/* and read the synthesis input from 544 bytes further into the area.      */
80 xchgl %edx,%ecx
81 incl TEMP
82 leal 544(%esi),%esi
83 .L02:
84 emms
/* Call MPL_DCT64(ecx, edx, bandPtr): arguments are pushed last to first   */
/* and removed by the addl afterwards. The code below relies on %ebx, %esi */
/* and %edi surviving this call.                                           */
85 pushl 8(%ebp)
86 pushl %edx
87 pushl %ecx
88 call MPL_DCT64
89 addl $12, %esp
90 leal 1(%ebx), %ecx /* ecx = 17, given ebx is still 16 */
91 subl TEMP,%ebx /* ebx = 16 - saved index */
92 pushl %ecx /* keep the count on the stack; TEMP is not used from here on */
93 /* leal ASM_NAME(decwins)(%ebx,%ebx,1), %edx */
94 movl 28(%ebp),%ecx /* ecx = decwins */
95 leal (%ecx,%ebx,2), %edx /* edx = decwins + 2*(16 - saved index) bytes */
96 movl (%esp),%ecx /* restore, but leave value on stack */
97 shrl $1, %ecx /* two samples per iteration: 17 >> 1 = 8 iterations */
98 ALIGN16
/* First half, main loop. Each pass produces two samples:                  */
/*   A (mm0..mm3): window words at 0..31(%edx)  times buffer 0..31(%esi)   */
/*   B (mm4..mm7): window words at 64..95(%edx) times buffer 32..63(%esi)  */
99 .L03:
100 movq (%edx),%mm0
101 movq 64(%edx),%mm4
102 pmaddwd (%esi),%mm0
103 pmaddwd 32(%esi),%mm4
104 movq 8(%edx),%mm1
105 movq 72(%edx),%mm5
106 pmaddwd 8(%esi),%mm1
107 pmaddwd 40(%esi),%mm5
108 movq 16(%edx),%mm2
109 movq 80(%edx),%mm6
110 pmaddwd 16(%esi),%mm2
111 pmaddwd 48(%esi),%mm6
112 movq 24(%edx),%mm3
113 movq 88(%edx),%mm7
114 pmaddwd 24(%esi),%mm3
115 pmaddwd 56(%esi),%mm7
116 paddd %mm1,%mm0 /* accumulate the four partial results per sample */
117 paddd %mm5,%mm4
118 paddd %mm2,%mm0
119 paddd %mm6,%mm4
120 paddd %mm3,%mm0
121 paddd %mm7,%mm4
122 movq %mm0,%mm1
123 movq %mm4,%mm5
124 psrlq $32,%mm1 /* bring the high dword down ... */
125 psrlq $32,%mm5
126 paddd %mm1,%mm0 /* ... so the low dword holds the full 16-product sum */
127 paddd %mm5,%mm4
128 psrad $13,%mm0 /* arithmetic shift right by 13 */
129 psrad $13,%mm4
130 packssdw %mm0,%mm0 /* saturate to signed 16 bit; sample ends up in the low word */
131 packssdw %mm4,%mm4
132 movq (%edi), %mm1 /* load the 8 bytes currently in the output buffer */
133 punpckldq %mm4, %mm0 /* mm0 = low dword from A, high dword from B */
134 pand one_null, %mm1 /* keep the upper word of each existing dword */
135 pand null_one, %mm0 /* keep only the new sample in each dword */
136 por %mm0, %mm1
137 movq %mm1,(%edi) /* store both samples, other 16-bit slots untouched */
138 leal 64(%esi),%esi
139 leal 128(%edx),%edx
140 leal 8(%edi),%edi
141 decl %ecx
142 jnz .L03
143 popl %ecx /* fetch the count saved before the loop */
144 andl $1, %ecx /* odd count leaves one more sample; for 17 this is 1 */
145 jecxz .next_loop
/* Single remaining sample of the first half, same arithmetic as sample A. */
146 movq (%edx),%mm0
147 pmaddwd (%esi),%mm0
148 movq 8(%edx),%mm1
149 pmaddwd 8(%esi),%mm1
150 movq 16(%edx),%mm2
151 pmaddwd 16(%esi),%mm2
152 movq 24(%edx),%mm3
153 pmaddwd 24(%esi),%mm3
154 paddd %mm1,%mm0
155 paddd %mm2,%mm0
156 paddd %mm3,%mm0
157 movq %mm0,%mm1
158 psrlq $32,%mm1
159 paddd %mm1,%mm0
160 psrad $13,%mm0
161 packssdw %mm0,%mm0
162 movd %mm0,%eax
163 movw %ax, (%edi) /* plain 16-bit store of the sample */
164 leal 32(%esi),%esi
165 leal 64(%edx),%edx
166 leal 4(%edi),%edi
167 .next_loop:
/* Second half: the buffer pointer now walks backwards while the window    */
/* pointer keeps walking forwards, and every result is negated.            */
168 subl $64,%esi
169 movl $7,%ecx /* 7 iterations of two samples each */
170 ALIGN16
/* Per pass: A uses buffer words at 0..31(%esi), B uses -32..-1(%esi).     */
171 .L04:
172 movq (%edx),%mm0
173 movq 64(%edx),%mm4
174 pmaddwd (%esi),%mm0
175 pmaddwd -32(%esi),%mm4
176 movq 8(%edx),%mm1
177 movq 72(%edx),%mm5
178 pmaddwd 8(%esi),%mm1
179 pmaddwd -24(%esi),%mm5
180 movq 16(%edx),%mm2
181 movq 80(%edx),%mm6
182 pmaddwd 16(%esi),%mm2
183 pmaddwd -16(%esi),%mm6
184 movq 24(%edx),%mm3
185 movq 88(%edx),%mm7
186 pmaddwd 24(%esi),%mm3
187 pmaddwd -8(%esi),%mm7
188 paddd %mm1,%mm0
189 paddd %mm5,%mm4
190 paddd %mm2,%mm0
191 paddd %mm6,%mm4
192 paddd %mm3,%mm0
193 paddd %mm7,%mm4
194 movq %mm0,%mm1
195 movq %mm4,%mm5
196 psrlq $32,%mm1
197 psrlq $32,%mm5
198 paddd %mm0,%mm1 /* here the total lands in mm1 / mm5 ... */
199 paddd %mm4,%mm5
200 psrad $13,%mm1
201 psrad $13,%mm5
202 packssdw %mm1,%mm1
203 packssdw %mm5,%mm5
204 psubd %mm0,%mm0 /* ... so mm0 / mm4 can be zeroed ... */
205 psubd %mm4,%mm4
206 psubsw %mm1,%mm0 /* ... and hold 0 - sample with signed word saturation */
207 psubsw %mm5,%mm4
208 movq (%edi), %mm1 /* merge into the output exactly as in .L03 */
209 punpckldq %mm4, %mm0
210 pand one_null, %mm1
211 pand null_one, %mm0
212 por %mm0, %mm1
213 movq %mm1,(%edi)
214 subl $64,%esi
215 addl $128,%edx
216 leal 8(%edi),%edi
217 decl %ecx
218 jnz .L04
/* Final sample: same negated computation for a single output word.        */
219 movq (%edx),%mm0
220 pmaddwd (%esi),%mm0
221 movq 8(%edx),%mm1
222 pmaddwd 8(%esi),%mm1
223 movq 16(%edx),%mm2
224 pmaddwd 16(%esi),%mm2
225 movq 24(%edx),%mm3
226 pmaddwd 24(%esi),%mm3
227 paddd %mm1,%mm0
228 paddd %mm2,%mm0
229 paddd %mm3,%mm0
230 movq %mm0,%mm1
231 psrlq $32,%mm1
232 paddd %mm0,%mm1
233 psrad $13,%mm1
234 packssdw %mm1,%mm1
235 psubd %mm0,%mm0
236 psubsw %mm1,%mm0
237 movd %mm0,%eax
238 movw %ax,(%edi)
239 emms /* leave MMX state so x87 code can run again */
240
241 #NO_APP
/* Epilogue: restore the saved registers in reverse push order, drop the   */
/* TEMP slot and return.                                                   */
242 popl %ebx
243 popl %esi
244 popl %edi
245 addl $4,%esp
246 popl %ebp
247 ret