562
|
1 /*
|
|
2 decode_sse3d: Synth for SSE and extended 3DNow (yeah, the name is a relic)
|
|
3
|
|
4 copyright 2006-2007 by Zuxy Meng/the mpg123 project - free software under the terms of the LGPL 2.1
|
|
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
|
|
6 initially written by the mysterious higway for MMX (apparently)
|
|
7 then developed into SSE opt by Zuxy Meng, also building on Romain Dolbeau's AltiVec
|
|
8 Both have agreed to distribution under LGPL 2.1 .
|
|
9
|
|
10 Transformed back into standalone asm, with help of
|
|
11 gcc -S -DHAVE_CONFIG_H -I. -march=pentium -O3 -Wall -pedantic -fno-strict-aliasing -DREAL_IS_FLOAT -c -o decode_mmxsse.{S,c}
|
|
12
|
|
13 The difference between SSE and 3DNowExt is the dct64 function and the synth function name.
|
|
14 This template here uses the SYNTH_NAME and MPL_DCT64 macros for this - see decode_sse.S and decode_3dnowext.S...
|
|
15 That's not memory efficient since there's doubled code, but it's easier than giving another function pointer.
|
|
16 Maybe I'll change it in future, but now I need something that works.
|
|
17
|
|
18 Original comment from MPlayer source follows:
|
|
19 */
|
|
20
|
|
21 /*
|
|
22 * this code comes under GPL
|
|
23 * This code was taken from http://www.mpg123.org
|
|
24 * See ChangeLog of mpg123-0.59s-pre.1 for detail
|
|
25 * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
|
|
26 *
|
|
27 * Local ChangeLog:
|
|
28 * - Partial loops unrolling and removing MOVW insn from loops
|
|
29 */
|
|
30
|
|
31 #include "mangle.h"
|
|
32
|
|
.data
/*
 * Two 64-bit masks applied with pand by the synth routine in this file when
 * it merges new 16-bit samples into 8 bytes loaded from the output buffer.
 * Each mask is the same 32-bit pattern stored twice:
 *   one_null = 0xffff0000 : keeps the upper 16 bits of each 32-bit lane
 *   null_one = 0x0000ffff : keeps the lower 16 bits of each 32-bit lane
 * Both are 8-byte aligned via ALIGN8.
 */
	ALIGN8
one_null:
	.long	0xffff0000
	.long	0xffff0000
	ALIGN8
null_one:
	.long	0x0000ffff
	.long	0x0000ffff
|
|
42
|
|
.text
	ALIGN16,,15
/*
 * void SYNTH_NAME(real *bandPtr, int channel, short *samples,
 *                 short *buffs, int *bo, float *decwins)
 *
 * 32-bit x86, AT&T syntax, all arguments on the stack, MMX registers used.
 * SYNTH_NAME and MPL_DCT64 are macros supplied by the including file.
 *
 * Stack layout relative to %ebp once the frame is set up:
 *    0 = saved ebp   4 = return address
 *    8 = bandPtr    12 = channel   16 = samples
 *   20 = buffs      24 = bo        28 = decwins
 *   -4 = one dword of scratch, addressed as TEMP
 *
 * What the code does, in order:
 *   1. Output pointer = samples + channel 16-bit words.
 *   2. Unless channel == 1: *bo is decremented modulo 16 and stored back,
 *      and the work buffer base is moved 1088 bytes into buffs.
 *   3. MPL_DCT64 is called with two pointers into the work buffer plus
 *      bandPtr; which pointer goes first depends on the parity of
 *      (bo + 1) & 15.
 *   4. 32 output words are produced, each written 4 bytes after the
 *      previous one: 8 pairs + 1 single by a plain dot product, then
 *      7 pairs + 1 single with the result negated.
 *
 * Both the window (from decwins) and the work buffer are read with pmaddwd,
 * i.e. as packed signed 16-bit words, regardless of the C types above.
 * NOTE(review): channel is presumably only ever 0 or 1 - confirm at callers.
 *
 * %edi, %esi, %ebx and %ebp are saved and restored.  %eax, %ecx, %edx and
 * mm0-mm7 are overwritten.  emms is executed before the MPL_DCT64 call and
 * again before returning.
 */
.globl SYNTH_NAME
SYNTH_NAME:
	pushl	%ebp
	movl	%esp, %ebp		/* arguments stay reachable through %ebp */
	subl	$4, %esp		/* scratch dword, see TEMP below */
	pushl	%edi
	pushl	%esi
	pushl	%ebx
/*
 * With three registers pushed after the scratch dword, the scratch dword
 * sits at 12(%esp).  TEMP is only used while %esp has this value.
 */
#define TEMP 12(%esp)
#APP
	movl	12(%ebp), %ecx		/* ecx = channel */
	movl	16(%ebp), %edi		/* edi = samples */
	movl	$15, %ebx		/* mask for modulo-16 arithmetic */
	movl	24(%ebp), %edx		/* edx = bo */
	leal	(%edi,%ecx,2), %edi	/* edi = samples + channel words */
	decl	%ecx			/* ecx = channel - 1 */
	movl	20(%ebp), %esi		/* esi = buffs */
	movl	(%edx), %eax		/* eax = *bo */
	jecxz	.Lsyn_bo_done		/* channel == 1: leave *bo and esi alone */
	decl	%eax
	andl	%ebx, %eax		/* eax = (*bo - 1) & 15 */
	leal	1088(%esi), %esi	/* use the region 1088 bytes into buffs */
	movl	%eax, (%edx)		/* store the new position back to *bo */
.Lsyn_bo_done:
	leal	(%esi,%eax,2), %edx	/* edx = base + bo words */
	movl	%eax, TEMP		/* TEMP = bo */
	incl	%eax
	andl	%ebx, %eax		/* eax = (bo + 1) & 15 */
	leal	544(%esi,%eax,2), %ecx	/* ecx = base + 544 + ((bo+1)&15) words */
	incl	%ebx			/* ebx = 16 */
	testl	$1, %eax
	jnz	.Lsyn_call_dct		/* odd: keep pointer order, TEMP and esi */
	xchgl	%edx, %ecx		/* even: swap the two DCT pointers, */
	incl	TEMP			/*       TEMP = bo + 1, */
	leal	544(%esi), %esi		/*       read from the half at +544 */
.Lsyn_call_dct:
	emms
	/* Call is MPL_DCT64(ecx, edx, bandPtr); arguments pushed last-to-first. */
	pushl	8(%ebp)
	pushl	%edx
	pushl	%ecx
	call	MPL_DCT64
	addl	$12, %esp		/* drop the three arguments */
	/*
	 * %ebx, %esi and %edi are used below without being reloaded, so
	 * MPL_DCT64 has to preserve them.
	 */
	leal	1(%ebx), %ecx		/* ecx = 17 */
	subl	TEMP, %ebx		/* ebx = 16 - TEMP */
	pushl	%ecx			/* keep the count on the stack for later */
	/* Former absolute form: leal ASM_NAME(decwins)(%ebx,%ebx,1), %edx */
	movl	28(%ebp), %ecx		/* ecx = decwins */
	leal	(%ecx,%ebx,2), %edx	/* edx = decwins + (16 - TEMP) words */
	movl	(%esp), %ecx		/* reload the count, leave it on the stack */
	shrl	$1, %ecx		/* pairs to do: 17 >> 1 = 8 */
	ALIGN16
/*
 * First loop, two outputs per pass.
 *   mm0-mm3: window bytes  0..31  times buffer bytes  0..31
 *   mm4-mm7: window bytes 64..95  times buffer bytes 32..63
 * Each group of four pmaddwd results is added up, the two dword halves are
 * added together, the sum is shifted right arithmetically by 13 and
 * saturated to 16 bits by packssdw.
 */
.Lsyn_plain_pair:
	movq	(%edx), %mm0
	movq	64(%edx), %mm4
	pmaddwd	(%esi), %mm0
	pmaddwd	32(%esi), %mm4
	movq	8(%edx), %mm1
	movq	72(%edx), %mm5
	pmaddwd	8(%esi), %mm1
	pmaddwd	40(%esi), %mm5
	movq	16(%edx), %mm2
	movq	80(%edx), %mm6
	pmaddwd	16(%esi), %mm2
	pmaddwd	48(%esi), %mm6
	movq	24(%edx), %mm3
	movq	88(%edx), %mm7
	pmaddwd	24(%esi), %mm3
	pmaddwd	56(%esi), %mm7
	paddd	%mm1, %mm0
	paddd	%mm5, %mm4
	paddd	%mm2, %mm0
	paddd	%mm6, %mm4
	paddd	%mm3, %mm0
	paddd	%mm7, %mm4
	movq	%mm0, %mm1
	movq	%mm4, %mm5
	psrlq	$32, %mm1		/* bring the high dword down */
	psrlq	$32, %mm5
	paddd	%mm1, %mm0		/* low dword = complete sum */
	paddd	%mm5, %mm4
	psrad	$13, %mm0
	psrad	$13, %mm4
	packssdw %mm0, %mm0		/* saturate to signed 16 bit */
	packssdw %mm4, %mm4
	/*
	 * Merge into 8 output bytes: the words at 0(%edi) and 4(%edi) get the
	 * two new values, the words at 2(%edi) and 6(%edi) are written back
	 * as they were read.
	 */
	movq	(%edi), %mm1
	punpckldq %mm4, %mm0		/* low dword: 1st result, high dword: 2nd */
	pand	one_null, %mm1		/* keep the upper word of each dword */
	pand	null_one, %mm0		/* keep the lower word of each dword */
	por	%mm0, %mm1
	movq	%mm1, (%edi)
	leal	64(%esi), %esi
	leal	128(%edx), %edx
	leal	8(%edi), %edi
	decl	%ecx
	jnz	.Lsyn_plain_pair
	popl	%ecx			/* the count pushed before the loop */
	andl	$1, %ecx
	jecxz	.Lsyn_negated_setup	/* even count: no leftover output */
	/* Leftover single output, same arithmetic as one half of the loop. */
	movq	(%edx), %mm0
	pmaddwd	(%esi), %mm0
	movq	8(%edx), %mm1
	pmaddwd	8(%esi), %mm1
	movq	16(%edx), %mm2
	pmaddwd	16(%esi), %mm2
	movq	24(%edx), %mm3
	pmaddwd	24(%esi), %mm3
	paddd	%mm1, %mm0
	paddd	%mm2, %mm0
	paddd	%mm3, %mm0
	movq	%mm0, %mm1
	psrlq	$32, %mm1
	paddd	%mm1, %mm0
	psrad	$13, %mm0
	packssdw %mm0, %mm0
	movd	%mm0, %eax
	movw	%ax, (%edi)		/* write only the one 16-bit word */
	leal	32(%esi), %esi
	leal	64(%edx), %edx
	leal	4(%edi), %edi
.Lsyn_negated_setup:
	subl	$64, %esi		/* from here on the buffer is walked backwards */
	movl	$7, %ecx		/* seven pairs */
	ALIGN16
/*
 * Second loop, two outputs per pass.
 *   mm0-mm3: window bytes  0..31  times buffer bytes   0..31
 *   mm4-mm7: window bytes 64..95  times buffer bytes -32..-1
 * Same summation, shift and saturation as the first loop, but the packed
 * result is subtracted from zero with psubsw, so the stored value is the
 * saturated negative of the sum.
 */
.Lsyn_negated_pair:
	movq	(%edx), %mm0
	movq	64(%edx), %mm4
	pmaddwd	(%esi), %mm0
	pmaddwd	-32(%esi), %mm4
	movq	8(%edx), %mm1
	movq	72(%edx), %mm5
	pmaddwd	8(%esi), %mm1
	pmaddwd	-24(%esi), %mm5
	movq	16(%edx), %mm2
	movq	80(%edx), %mm6
	pmaddwd	16(%esi), %mm2
	pmaddwd	-16(%esi), %mm6
	movq	24(%edx), %mm3
	movq	88(%edx), %mm7
	pmaddwd	24(%esi), %mm3
	pmaddwd	-8(%esi), %mm7
	paddd	%mm1, %mm0
	paddd	%mm5, %mm4
	paddd	%mm2, %mm0
	paddd	%mm6, %mm4
	paddd	%mm3, %mm0
	paddd	%mm7, %mm4
	movq	%mm0, %mm1
	movq	%mm4, %mm5
	psrlq	$32, %mm1
	psrlq	$32, %mm5
	paddd	%mm0, %mm1		/* complete sum ends up in mm1 / mm5 */
	paddd	%mm4, %mm5
	psrad	$13, %mm1
	psrad	$13, %mm5
	packssdw %mm1, %mm1
	packssdw %mm5, %mm5
	psubd	%mm0, %mm0		/* mm0 = 0 */
	psubd	%mm4, %mm4		/* mm4 = 0 */
	psubsw	%mm1, %mm0		/* mm0 = 0 - result, saturating */
	psubsw	%mm5, %mm4
	movq	(%edi), %mm1
	punpckldq %mm4, %mm0
	pand	one_null, %mm1
	pand	null_one, %mm0
	por	%mm0, %mm1
	movq	%mm1, (%edi)
	subl	$64, %esi
	addl	$128, %edx
	leal	8(%edi), %edi
	decl	%ecx
	jnz	.Lsyn_negated_pair
	/* Final single output, negated like the pairs above. */
	movq	(%edx), %mm0
	pmaddwd	(%esi), %mm0
	movq	8(%edx), %mm1
	pmaddwd	8(%esi), %mm1
	movq	16(%edx), %mm2
	pmaddwd	16(%esi), %mm2
	movq	24(%edx), %mm3
	pmaddwd	24(%esi), %mm3
	paddd	%mm1, %mm0
	paddd	%mm2, %mm0
	paddd	%mm3, %mm0
	movq	%mm0, %mm1
	psrlq	$32, %mm1
	paddd	%mm0, %mm1
	psrad	$13, %mm1
	packssdw %mm1, %mm1
	psubd	%mm0, %mm0
	psubsw	%mm1, %mm0
	movd	%mm0, %eax
	movw	%ax, (%edi)
	emms				/* leave MMX state before returning */

#NO_APP
	popl	%ebx
	popl	%esi
	popl	%edi
	addl	$4, %esp		/* release the scratch dword */
	popl	%ebp
	ret
|