comparison decoders/libmpg123/dct64_sse.S @ 562:7e08477b0fc1

MP3 decoder upgrade work. Ripped out SMPEG and mpglib support, replaced it with "mpg123.c" and libmpg123. libmpg123 is a much better version of mpglib, so it should solve all the problems about MP3's not seeking, or most modern MP3's not playing at all, etc. Since you no longer have to make a tradeoff with SMPEG for features, and SMPEG is basically rotting, I removed it from the project. There is still work to be done with libmpg123...there are MMX, 3DNow, SSE, Altivec, etc decoders which we don't have enabled at the moment, and the build system could use some work to make this compile more cleanly, etc. Still: huge win.
author Ryan C. Gordon <icculus@icculus.org>
date Fri, 30 Jan 2009 02:44:47 -0500
parents
children
comparison
equal deleted inserted replaced
561:f2985e08589c 562:7e08477b0fc1
1 /*
2 dct64_sse: MMX/SSE optimized dct64
3
4 copyright 2006-2007 by Zuxy Meng <zuxy.meng@gmail.com> / the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by the mysterious higway for MMX (apparently)
7 then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
8 Both have agreed to distribution under LGPL 2.1 .
9
10 Transformed back into standalone asm, with help of
11 gcc -S -DHAVE_CONFIG_H -I. -march=pentium3 -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o dct64_sse.{S,c}
12
13 Original comment from MPlayer source follows:
14 */
15
16 /*
17 * Discrete Cosine Transform (DCT) for SSE
18 * based upon code from mp3lib/dct64.c, mp3lib/dct64_altivec.c
19 * and mp3lib/dct64_MMX.c
20 */
21
22 #include "mangle.h"
23
24 #ifndef __APPLE__
25 .section .rodata
26 #else
27 .data
28 #endif
29 ALIGN16
30 /* .type nnnn, @object
31 .size nnnn, 16 */
/* Four copies of 0x80000000 — the IEEE-754 single-precision sign bit.
   Used with xorps to flip the sign of all four packed floats. */
32 nnnn:
33 .long -2147483648
34 .long -2147483648
35 .long -2147483648
36 .long -2147483648
37 ALIGN16
38 /* .type ppnn, @object
39 .size ppnn, 16 */
/* Sign mask {+,+,-,-}: xorps with this negates only the upper two lanes. */
40 ppnn:
41 .long 0
42 .long 0
43 .long -2147483648
44 .long -2147483648
45 ALIGN16
46 /* .type pnpn, @object
47 .size pnpn, 16 */
/* Sign mask {+,-,+,-}: xorps with this negates lanes 1 and 3. */
48 pnpn:
49 .long 0
50 .long -2147483648
51 .long 0
52 .long -2147483648
53 ALIGN4
54 /* .type one.4748, @object
55 .size one.4748, 4 */
/* 0x3F800000 == 1.0f, spliced into coefficient vectors below. */
56 one.4748:
57 .long 1065353216
58
59 .text
60 ALIGN16,,15
61 .globl ASM_NAME(dct64_sse)
62 /* .type ASM_NAME(dct64_sse), @function */
/*
 * dct64_sse(arg0, arg1, arg2) — 64-point DCT used by the mpg123 synth filter.
 * cdecl, x86-32. From the code below:
 *   arg2 (-> %eax) : input sample vector, read with unaligned loads (MOVUAPS)
 *   arg0 (-> %ecx) : first output buffer, written via x87 integer stores
 *   arg1 (-> %ebx) : second output buffer, likewise
 * NOTE(review): outputs are stored with fist/fistp and the tail copies a
 * 16-bit word from (%ecx) to (%ebx), so this variant appears to produce
 * integer (likely 16-bit) samples rather than floats — confirm against the
 * caller's synth buffer type.
 * Clobbers: eax, ecx, edx, xmm0-xmm7, x87 stack; saves/restores ebx, ebp.
 * Uses 256 bytes of 16-byte-aligned stack for the two scratch arrays b1/b2.
 * Depends on the external table costab_mmxsse (not visible in this file).
 */
63 ASM_NAME(dct64_sse):
64 pushl %ebp
65 movl %esp, %ebp
66 /* stack from ebp: 0=ebp 4=back 8=arg0 12=arg1 16=arg2 */
67 #define ARG(n) (8+n*4)(%ebp)
68 andl $-16, %esp /* align the stack at 16 bytes */
69 subl $256, %esp /* reserve space for local b1 and b2 */
70 pushl %ebx
71 /* stack from esp: 0=ebx 4...131=b2 132...259=b1 */
72 #define B1OFF 132
73 #define B2OFF 4
74 #define B1(n) (B1OFF+n)(%esp)
75 #define B2(n) (B2OFF+n)(%esp)
76
77 movl ARG(2), %eax
78 movl ARG(0), %ecx
79 /* APP */
/* Stage 1 (4 unrolled iterations): pair input[i] with the mirrored
   input[31-i]; sums go to the low half of b1, cos-scaled differences to
   the high half. shufps $27 (0b00011011) reverses the four lanes. */
80 /* for (i = 0; i < 0x20 / 2; i += 4) cycle 1 */
81 movaps ASM_NAME(costab_mmxsse), %xmm3
82 shufps $27, %xmm3, %xmm3
83 MOVUAPS (%eax), %xmm1
84 movaps %xmm1, %xmm4
85 MOVUAPS 112(%eax), %xmm2
86 shufps $27, %xmm4, %xmm4
87 movaps %xmm2, %xmm0
88 shufps $27, %xmm0, %xmm0
89 addps %xmm0, %xmm1
90 movaps %xmm1, B1(0)
91 subps %xmm2, %xmm4
92 mulps %xmm3, %xmm4
93 movaps %xmm4, B1(112)
94
95 /* NO_APP */
96 movl ARG(1), %ebx
97 /* APP */
98 /* for (i = 0; i < 0x20 / 2; i += 4) cycle 2 */
99 movaps ASM_NAME(costab_mmxsse)+16, %xmm3
100 shufps $27, %xmm3, %xmm3
101 MOVUAPS 16(%eax), %xmm1
102 movaps %xmm1, %xmm4
103 MOVUAPS 96(%eax), %xmm2
104 shufps $27, %xmm4, %xmm4
105 movaps %xmm2, %xmm0
106 shufps $27, %xmm0, %xmm0
107 addps %xmm0, %xmm1
108 movaps %xmm1, B1(16)
109 subps %xmm2, %xmm4
110 mulps %xmm3, %xmm4
111 movaps %xmm4, B1(96)
112
113 /* for (i = 0; i < 0x20 / 2; i += 4) cycle 3 */
114 movaps ASM_NAME(costab_mmxsse)+32, %xmm3
115 shufps $27, %xmm3, %xmm3
116 MOVUAPS 32(%eax), %xmm1
117 movaps %xmm1, %xmm4
118 MOVUAPS 80(%eax), %xmm2
119 shufps $27, %xmm4, %xmm4
120 movaps %xmm2, %xmm0
121 shufps $27, %xmm0, %xmm0
122 addps %xmm0, %xmm1
123 movaps %xmm1, B1(32)
124 subps %xmm2, %xmm4
125 mulps %xmm3, %xmm4
126 movaps %xmm4, B1(80)
127
128 /* for (i = 0; i < 0x20 / 2; i += 4) cycle 4 */
129 movaps ASM_NAME(costab_mmxsse)+48, %xmm3
130 shufps $27, %xmm3, %xmm3
131 MOVUAPS 48(%eax), %xmm1
132 movaps %xmm1, %xmm4
133 MOVUAPS 64(%eax), %xmm2
134 shufps $27, %xmm4, %xmm4
135 movaps %xmm2, %xmm0
136 shufps $27, %xmm0, %xmm0
137 addps %xmm0, %xmm1
138 movaps %xmm1, B1(48)
139 subps %xmm2, %xmm4
140 mulps %xmm3, %xmm4
141 movaps %xmm4, B1(64)
142
/* Stage 2a: butterfly the low half of b1 (sums and differences of
   lane-reversed quarters) into the low half of b2. */
143 movaps B1(0), %xmm1
144 movaps B1(16), %xmm3
145 movaps B1(32), %xmm4
146 movaps B1(48), %xmm6
147 movaps %xmm1, %xmm7
148 shufps $27, %xmm7, %xmm7
149 movaps %xmm3, %xmm5
150 shufps $27, %xmm5, %xmm5
151 movaps %xmm4, %xmm2
152 shufps $27, %xmm2, %xmm2
153 movaps %xmm6, %xmm0
154 shufps $27, %xmm0, %xmm0
155 addps %xmm0, %xmm1
156 movaps %xmm1, B2(0)
157 addps %xmm2, %xmm3
158 movaps %xmm3, B2(16)
159 subps %xmm4, %xmm5
160 movaps %xmm5, B2(32)
161 subps %xmm6, %xmm7
162 movaps %xmm7, B2(48)
163
/* Stage 2b: same butterfly for the high half of b1 into b2. */
164 movaps B1(64), %xmm1
165 movaps B1(80), %xmm3
166 movaps B1(96), %xmm4
167 movaps B1(112), %xmm6
168 movaps %xmm1, %xmm7
169 shufps $27, %xmm7, %xmm7
170 movaps %xmm3, %xmm5
171 shufps $27, %xmm5, %xmm5
172 movaps %xmm4, %xmm2
173 shufps $27, %xmm2, %xmm2
174 movaps %xmm6, %xmm0
175 shufps $27, %xmm0, %xmm0
176 addps %xmm0, %xmm1
177 movaps %xmm1, B2(64)
178 addps %xmm2, %xmm3
179 movaps %xmm3, B2(80)
180 subps %xmm4, %xmm5
181 movaps %xmm5, B2(96)
182 subps %xmm6, %xmm7
183 movaps %xmm7, B2(112)
184
/* Scale the difference quarters by costab+64/+80; the two upper quarters
   are additionally negated (0 - x via the zeroed xmm6/xmm7). */
185 movaps B2(32), %xmm0
186 movaps B2(48), %xmm1
187 movaps ASM_NAME(costab_mmxsse)+64, %xmm4
188 xorps %xmm6, %xmm6
189 shufps $27, %xmm4, %xmm4
190 mulps %xmm4, %xmm1
191 movaps ASM_NAME(costab_mmxsse)+80, %xmm2
192 xorps %xmm7, %xmm7
193 shufps $27, %xmm2, %xmm2
194 mulps %xmm2, %xmm0
195 movaps %xmm0, B2(32)
196 movaps %xmm1, B2(48)
197 movaps B2(96), %xmm3
198 mulps %xmm2, %xmm3
199 subps %xmm3, %xmm6
200 movaps %xmm6, B2(96)
201 movaps B2(112), %xmm5
202 mulps %xmm4, %xmm5
203 subps %xmm5, %xmm7
204 movaps %xmm7, B2(112)
205
/* Stage 3: 16-wide butterflies back into b1. xmm5 holds the all-lanes
   sign mask (nnnn); xmm6 is toggled by "xorps %xmm5, %xmm6" before each
   block, so the cos-scaled difference alternates sign block to block. */
206 movaps ASM_NAME(costab_mmxsse)+96, %xmm0
207 shufps $27, %xmm0, %xmm0
208 movaps nnnn, %xmm5
209 movaps %xmm5, %xmm6
210
211 movaps B2(0), %xmm2
212 movaps B2(16), %xmm3
213 movaps %xmm2, %xmm4
214 xorps %xmm5, %xmm6
215 shufps $27, %xmm4, %xmm4
216 movaps %xmm3, %xmm1
217 shufps $27, %xmm1, %xmm1
218 addps %xmm1, %xmm2
219 movaps %xmm2, B1(0)
220 subps %xmm3, %xmm4
221 xorps %xmm6, %xmm4
222 mulps %xmm0, %xmm4
223 movaps %xmm4, B1(16)
224
225 movaps B2(32), %xmm2
226 movaps B2(48), %xmm3
227 movaps %xmm2, %xmm4
228 xorps %xmm5, %xmm6
229 shufps $27, %xmm4, %xmm4
230 movaps %xmm3, %xmm1
231 shufps $27, %xmm1, %xmm1
232 addps %xmm1, %xmm2
233 movaps %xmm2, B1(32)
234 subps %xmm3, %xmm4
235 xorps %xmm6, %xmm4
236 mulps %xmm0, %xmm4
237 movaps %xmm4, B1(48)
238
239 movaps B2(64), %xmm2
240 movaps B2(80), %xmm3
241 movaps %xmm2, %xmm4
242 xorps %xmm5, %xmm6
243 shufps $27, %xmm4, %xmm4
244 movaps %xmm3, %xmm1
245 shufps $27, %xmm1, %xmm1
246 addps %xmm1, %xmm2
247 movaps %xmm2, B1(64)
248 subps %xmm3, %xmm4
249 xorps %xmm6, %xmm4
250 mulps %xmm0, %xmm4
251 movaps %xmm4, B1(80)
252
253 movaps B2(96), %xmm2
254 movaps B2(112), %xmm3
255 movaps %xmm2, %xmm4
256 xorps %xmm5, %xmm6
257 shufps $27, %xmm4, %xmm4
258 movaps %xmm3, %xmm1
259 shufps $27, %xmm1, %xmm1
260 addps %xmm1, %xmm2
261 movaps %xmm2, B1(96)
262 subps %xmm3, %xmm4
263 xorps %xmm6, %xmm4
264 mulps %xmm0, %xmm4
265 movaps %xmm4, B1(112)
266
/* Stage 4: build the coefficient vector xmm0 by interleaving 1.0f with
   costab+112/+116, then for each 16-byte group form intra-vector
   butterflies: ppnn negates the upper pair before the add. */
267 movss one.4748, %xmm1
268 movss ASM_NAME(costab_mmxsse)+112, %xmm0
269 movaps %xmm1, %xmm3
270 unpcklps %xmm0, %xmm3
271 movss ASM_NAME(costab_mmxsse)+116, %xmm2
272 movaps %xmm1, %xmm0
273 unpcklps %xmm2, %xmm0
274 unpcklps %xmm3, %xmm0
275 movaps ppnn, %xmm2
276
277 movaps B1(0), %xmm3
278 movaps %xmm3, %xmm4
279 shufps $20, %xmm4, %xmm4
280 shufps $235, %xmm3, %xmm3
281 xorps %xmm2, %xmm3
282 addps %xmm3, %xmm4
283 mulps %xmm0, %xmm4
284 movaps %xmm4, B2(0)
285 movaps B1(16), %xmm6
286 movaps %xmm6, %xmm5
287 shufps $27, %xmm5, %xmm5
288 xorps %xmm2, %xmm5
289 addps %xmm5, %xmm6
290 mulps %xmm0, %xmm6
291 movaps %xmm6, B2(16)
292
293 movaps B1(32), %xmm3
294 movaps %xmm3, %xmm4
295 shufps $20, %xmm4, %xmm4
296 shufps $235, %xmm3, %xmm3
297 xorps %xmm2, %xmm3
298 addps %xmm3, %xmm4
299 mulps %xmm0, %xmm4
300 movaps %xmm4, B2(32)
301 movaps B1(48), %xmm6
302 movaps %xmm6, %xmm5
303 shufps $27, %xmm5, %xmm5
304 xorps %xmm2, %xmm5
305 addps %xmm5, %xmm6
306 mulps %xmm0, %xmm6
307 movaps %xmm6, B2(48)
308
309 movaps B1(64), %xmm3
310 movaps %xmm3, %xmm4
311 shufps $20, %xmm4, %xmm4
312 shufps $235, %xmm3, %xmm3
313 xorps %xmm2, %xmm3
314 addps %xmm3, %xmm4
315 mulps %xmm0, %xmm4
316 movaps %xmm4, B2(64)
317 movaps B1(80), %xmm6
318 movaps %xmm6, %xmm5
319 shufps $27, %xmm5, %xmm5
320 xorps %xmm2, %xmm5
321 addps %xmm5, %xmm6
322 mulps %xmm0, %xmm6
323 movaps %xmm6, B2(80)
324
325 movaps B1(96), %xmm3
326 movaps %xmm3, %xmm4
327 shufps $20, %xmm4, %xmm4
328 shufps $235, %xmm3, %xmm3
329 xorps %xmm2, %xmm3
330 addps %xmm3, %xmm4
331 mulps %xmm0, %xmm4
332 movaps %xmm4, B2(96)
333 movaps B1(112), %xmm6
334 movaps %xmm6, %xmm5
335 shufps $27, %xmm5, %xmm5
336 xorps %xmm2, %xmm5
337 addps %xmm5, %xmm6
338 mulps %xmm0, %xmm6
339 movaps %xmm6, B2(112)
340
/* Stage 5: pairwise butterflies within each vector using costab+120 and
   the alternating mask pnpn; results of B2(32..127) go back into b1. */
341 movss ASM_NAME(costab_mmxsse)+120, %xmm0
342 movaps %xmm1, %xmm2
343 movaps %xmm0, %xmm7
344 unpcklps %xmm1, %xmm2
345 unpcklps %xmm0, %xmm7
346 movaps pnpn, %xmm0
347 unpcklps %xmm7, %xmm2
348
349 movaps B2(32), %xmm1
350 movaps %xmm1, %xmm3
351 shufps $224, %xmm3, %xmm3
352 shufps $181, %xmm1, %xmm1
353 xorps %xmm0, %xmm1
354 addps %xmm1, %xmm3
355 mulps %xmm2, %xmm3
356 movaps %xmm3, B1(32)
357 movaps B2(48), %xmm4
358 movaps %xmm4, %xmm5
359 shufps $224, %xmm5, %xmm5
360 shufps $181, %xmm4, %xmm4
361 xorps %xmm0, %xmm4
362 addps %xmm4, %xmm5
363 mulps %xmm2, %xmm5
364 movaps %xmm5, B1(48)
365
366 movaps B2(64), %xmm1
367 movaps %xmm1, %xmm3
368 shufps $224, %xmm3, %xmm3
369 shufps $181, %xmm1, %xmm1
370 xorps %xmm0, %xmm1
371 addps %xmm1, %xmm3
372 mulps %xmm2, %xmm3
373 movaps %xmm3, B1(64)
374 movaps B2(80), %xmm4
375 movaps %xmm4, %xmm5
376 shufps $224, %xmm5, %xmm5
377 shufps $181, %xmm4, %xmm4
378 xorps %xmm0, %xmm4
379 addps %xmm4, %xmm5
380 mulps %xmm2, %xmm5
381 movaps %xmm5, B1(80)
382
383 movaps B2(96), %xmm1
384 movaps %xmm1, %xmm3
385 shufps $224, %xmm3, %xmm3
386 shufps $181, %xmm1, %xmm1
387 xorps %xmm0, %xmm1
388 addps %xmm1, %xmm3
389 mulps %xmm2, %xmm3
390 movaps %xmm3, B1(96)
391 movaps B2(112), %xmm4
392 movaps %xmm4, %xmm5
393 shufps $224, %xmm5, %xmm5
394 shufps $181, %xmm4, %xmm4
395 xorps %xmm0, %xmm4
396 addps %xmm4, %xmm5
397 mulps %xmm2, %xmm5
398 movaps %xmm5, B1(112)
399
400 /* NO_APP */
/* Final scalar pass (x87): fold odd-indexed b1 elements into each other.
   The exact fld/fadds/fstps ordering is load-bearing — do not reorder. */
401 flds B1(40)
402 movl %esp, %edx
403 addl $B1OFF, %edx
404 movl %esp, %eax
405 addl $B2OFF, %eax
406 fadds B1(44)
407 fstps B1(40)
408 flds B1(56)
409 fadds B1(60)
410 flds B1(48)
411 fadd %st(1), %st
412 fstps B1(48)
413 fadds B1(52)
414 fstps B1(56)
415 flds B1(52)
416 fadds B1(60)
417 fstps B1(52)
418 flds B1(72)
419 fadds B1(76)
420 fstps B1(72)
421 flds B1(88)
422 fadds B1(92)
423 flds B1(80)
424 fadd %st(1), %st
425 fstps B1(80)
426 fadds B1(84)
427 fstps B1(88)
428 flds B1(84)
429 fadds B1(92)
430 fstps B1(84)
431 flds B1(104)
432 fadds B1(108)
433 fstps B1(104)
434 flds B1(120)
435 fadds B1(124)
436 flds B1(112)
437 fadd %st(1), %st
438 fstps B1(112)
439 fadds B1(116)
440 fstps B1(120)
441 flds B1(116)
442 fadds B1(124)
443 fstps B1(116)
444 /* APP */
/* Output stage: eax -> b2, edx -> b1. costab+120 stays on the x87 stack
   as a scaling factor for the difference terms. Results are written to
   out0 (%ecx) and out1 (%ebx) at stride-64-byte offsets with fist/fistp
   (integer stores) — see NOTE(review) in the header about sample format. */
445 flds ASM_NAME(costab_mmxsse)+120
446 flds (%eax)
447 fadds 4(%eax)
448 fistp 512(%ecx)
449 flds (%eax)
450 fsubs 4(%eax)
451 fmul %st(1)
452 fistp (%ecx)
453 flds 12(%eax)
454 fsubs 8(%eax)
455 fmul %st(1)
456 fist 256(%ebx)
457 fadds 12(%eax)
458 fadds 8(%eax)
459 fistp 256(%ecx)
460 flds 16(%eax)
461 fsubs 20(%eax)
462 fmul %st(1)
463 flds 28(%eax)
464 fsubs 24(%eax)
465 fmul %st(2)
466 fist 384(%ebx)
467 fld %st(0)
468 fadds 24(%eax)
469 fadds 28(%eax)
470 fld %st(0)
471 fadds 16(%eax)
472 fadds 20(%eax)
473 fistp 384(%ecx)
474 fadd %st(2)
475 fistp 128(%ecx)
476 faddp %st(1)
477 fistp 128(%ebx)
478 flds 32(%edx)
479 fadds 48(%edx)
480 fistp 448(%ecx)
481 flds 48(%edx)
482 fadds 40(%edx)
483 fistp 320(%ecx)
484 flds 40(%edx)
485 fadds 56(%edx)
486 fistp 192(%ecx)
487 flds 56(%edx)
488 fadds 36(%edx)
489 fistp 64(%ecx)
490 flds 36(%edx)
491 fadds 52(%edx)
492 fistp 64(%ebx)
493 flds 52(%edx)
494 fadds 44(%edx)
495 fistp 192(%ebx)
496 flds 60(%edx)
497 fist 448(%ebx)
498 fadds 44(%edx)
499 fistp 320(%ebx)
500 flds 96(%edx)
501 fadds 112(%edx)
502 fld %st(0)
503 fadds 64(%edx)
504 fistp 480(%ecx)
505 fadds 80(%edx)
506 fistp 416(%ecx)
507 flds 112(%edx)
508 fadds 104(%edx)
509 fld %st(0)
510 fadds 80(%edx)
511 fistp 352(%ecx)
512 fadds 72(%edx)
513 fistp 288(%ecx)
514 flds 104(%edx)
515 fadds 120(%edx)
516 fld %st(0)
517 fadds 72(%edx)
518 fistp 224(%ecx)
519 fadds 88(%edx)
520 fistp 160(%ecx)
521 flds 120(%edx)
522 fadds 100(%edx)
523 fld %st(0)
524 fadds 88(%edx)
525 fistp 96(%ecx)
526 fadds 68(%edx)
527 fistp 32(%ecx)
528 flds 100(%edx)
529 fadds 116(%edx)
530 fld %st(0)
531 fadds 68(%edx)
532 fistp 32(%ebx)
533 fadds 84(%edx)
534 fistp 96(%ebx)
535 flds 116(%edx)
536 fadds 108(%edx)
537 fld %st(0)
538 fadds 84(%edx)
539 fistp 160(%ebx)
540 fadds 76(%edx)
541 fistp 224(%ebx)
542 flds 108(%edx)
543 fadds 124(%edx)
544 fld %st(0)
545 fadds 76(%edx)
546 fistp 288(%ebx)
547 fadds 92(%edx)
548 fistp 352(%ebx)
549 flds 124(%edx)
550 fist 480(%ebx)
551 fadds 92(%edx)
552 fistp 416(%ebx)
/* Pop the leftover scale factor to leave the x87 stack empty.
   NOTE(review): ffreep is not accepted by all assemblers (notably older
   Apple as) — some ports encode it via .byte; confirm on target toolchain. */
553 ffreep %st(0)
554
555 /* NO_APP */
/* out1[0] = out0[0]: duplicate the first 16-bit output word. */
556 movzwl (%ecx), %eax
557 movw %ax, (%ebx)
558 popl %ebx
559 movl %ebp, %esp
560 popl %ebp
561 ret
562 /* .size ASM_NAME(dct64_sse), .-ASM_NAME(dct64_sse) */
563
564 /* Mark non-executable stack. */
565 #if defined(__linux__) && defined(__ELF__)
566 .section .note.GNU-stack,"",%progbits
567 #endif