SDL_sound_CoreAudio: comparison of decoders/libmpg123/dct64_altivec.c @ 562:7e08477b0fc1
MP3 decoder upgrade work.

Ripped out SMPEG and mpglib support and replaced them with "mpg123.c" and
libmpg123. libmpg123 is a much better version of mpglib, so it should solve
all the problems with MP3s not seeking, most modern MP3s not playing at all,
etc. Since you no longer have to make a tradeoff with SMPEG to get features,
and SMPEG is basically rotting, I removed it from the project.

There is still work to be done with libmpg123: there are MMX, 3DNow!, SSE,
AltiVec, etc. decoders which we don't have enabled at the moment, and the
build system could use some work to make this compile more cleanly.

Still: huge win.
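On the disabled optimized decoders: selecting one is a compile-time choice. A minimal sketch of the kind of guard involved is below, assuming an OPT_ALTIVEC-style define and a generic dct64() fallback; the macro and symbol names here are placeholders and may not match what libmpg123 or this tree actually uses.

    /* Hypothetical sketch only: use the AltiVec dct64 when AltiVec is
       available and enabled, otherwise fall back to the generic C version.
       OPT_ALTIVEC and opt_dct64 are placeholder names. */
    #if defined(__ALTIVEC__) && defined(OPT_ALTIVEC)
    #define opt_dct64(out0, out1, samples) dct64_altivec((out0), (out1), (samples))
    #else
    #define opt_dct64(out0, out1, samples) dct64((out0), (out1), (samples))
    #endif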
author  Ryan C. Gordon <icculus@icculus.org>
date    Fri, 30 Jan 2009 02:44:47 -0500
comparison: 561:f2985e08589c → 562:7e08477b0fc1
/*
dct64_altivec.c: Discrete Cosine Transform (DCT) for AltiVec

copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1
see COPYING and AUTHORS files in distribution or http://mpg123.org
initially written by Michael Hipp
altivec optimization by tmkk
*/

/*
 * Discrete Cosine Transform (DCT) for subband synthesis
 *
 * -funroll-loops (for gcc) will remove the loops for better performance
 * using loops in the source code enhances readability
 *
 *
 * TODO: write an optimized version for the down-sampling modes
 * (in these modes the bands 16-31 (2:1) or 8-31 (4:1) are zero)
 */
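
/*
 * For reference: dct64 computes the "matrixing" step of MPEG audio subband
 * synthesis (ISO/IEC 11172-3).  Up to mpg123's sign and buffer-ordering
 * conventions, for one granule of 32 subband samples S[0..31] it evaluates
 *
 *   V[i] = sum_{k=0}^{31} cos( (16+i)*(2*k+1)*pi/64 ) * S[k],   i = 0..63,
 *
 * as a cascade of butterfly passes; the pnts[] tables hold the precomputed
 * cosine coefficients for each pass, and the results are scattered into
 * out0/out1 with a stride of 16 reals (the 0x10 factor near the end).
 */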

#include "mpg123lib_intern.h"

#ifndef __APPLE__
#include <altivec.h>
#endif

void dct64_altivec(real *out0,real *out1,real *samples)
{
  ALIGNED(16) real bufs[64];

  {
    register real *b1,*costab;

    vector unsigned char vinvert,vperm1,vperm2,vperm3,vperm4;
    vector float v1,v2,v3,v4,v5,v6,v7,v8;
    vector float vbs1,vbs2,vbs3,vbs4,vbs5,vbs6,vbs7,vbs8;
    vector float vbs9,vbs10,vbs11,vbs12,vbs13,vbs14,vbs15,vbs16;
    vector float vzero;
    b1 = samples;
    costab = pnts[0];

    /* zero the register: xor of a register with itself is always 0 */
    vzero = vec_xor(vzero,vzero);
#ifdef __APPLE__
    vinvert = (vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
#else
    vinvert = (vector unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
#endif
    /* vperm1: mask for unaligned forward loads from b1;
       vperm2: the same load with the four 32-bit words reversed */
    vperm1 = vec_lvsl(0,b1);
    vperm2 = vec_perm(vperm1,vperm1,vinvert);

    /* first butterfly pass (pnts[0]): sums of mirrored samples b1[i] and
       b1[31-i] go to vbs1..vbs4, cos-scaled differences to vbs5..vbs8 */
    v1 = vec_ld(0,b1);
    v2 = vec_ld(16,b1);
    v3 = vec_ld(112,b1);
    v4 = vec_ld(127,b1);
    v5 = vec_perm(v1,v2,vperm1); /* b1[0,1,2,3] */
    v6 = vec_perm(v3,v4,vperm2); /* b1[31,30,29,28] */

    vbs1 = vec_add(v5,v6);
    vbs8 = vec_sub(v5,v6);

    v1 = vec_ld(32,b1);
    v4 = vec_ld(96,b1);
    v5 = vec_perm(v2,v1,vperm1); /* b1[4,5,6,7] */
    v6 = vec_perm(v4,v3,vperm2); /* b1[27,26,25,24] */

    vbs2 = vec_add(v5,v6);
    vbs7 = vec_sub(v5,v6);

    v2 = vec_ld(48,b1);
    v3 = vec_ld(80,b1);
    v5 = vec_perm(v1,v2,vperm1); /* b1[8,9,10,11] */
    v6 = vec_perm(v3,v4,vperm2); /* b1[23,22,21,20] */

    vbs3 = vec_add(v5,v6);
    vbs6 = vec_sub(v5,v6);

    v1 = vec_ld(64,b1);
    v5 = vec_perm(v2,v1,vperm1); /* b1[12,13,14,15] */
    v6 = vec_perm(v1,v3,vperm2); /* b1[19,18,17,16] */

    vbs4 = vec_add(v5,v6);
    vbs5 = vec_sub(v5,v6);

    v1 = vec_ld(0,costab);
    vbs8 = vec_madd(vbs8,v1,vzero);
    v2 = vec_ld(16,costab);
    vbs7 = vec_madd(vbs7,v2,vzero);
    v3 = vec_ld(32,costab);
    vbs6 = vec_madd(vbs6,v3,vzero);
    v4 = vec_ld(48,costab);
    vbs5 = vec_madd(vbs5,v4,vzero);
    vbs6 = vec_perm(vbs6,vbs6,vinvert);
    vbs5 = vec_perm(vbs5,vbs5,vinvert);

    /* second pass (pnts[1]): same butterflies within each 16-sample half */
    costab = pnts[1];

    v1 = vec_perm(vbs4,vbs4,vinvert);
    vbs9 = vec_add(vbs1,v1);
    v3 = vec_sub(vbs1,v1);
    v5 = vec_ld(0,costab);
    v2 = vec_perm(vbs3,vbs3,vinvert);
    vbs10 = vec_add(vbs2,v2);
    v4 = vec_sub(vbs2,v2);
    v6 = vec_ld(16,costab);
    vbs12 = vec_madd(v3,v5,vzero);
    vbs11 = vec_madd(v4,v6,vzero);

    v7 = vec_sub(vbs7,vbs6);
    v8 = vec_sub(vbs8,vbs5);
    vbs13 = vec_add(vbs5,vbs8);
    vbs14 = vec_add(vbs6,vbs7);
    vbs15 = vec_madd(v7,v6,vzero);
    vbs16 = vec_madd(v8,v5,vzero);

    /* third pass (pnts[2]): butterflies within each 8-sample group */
    costab = pnts[2];

    v1 = vec_perm(vbs10,vbs10,vinvert);
    v5 = vec_perm(vbs14,vbs14,vinvert);
    vbs1 = vec_add(v1,vbs9);
    vbs5 = vec_add(v5,vbs13);
    v2 = vec_sub(vbs9,v1);
    v6 = vec_sub(vbs13,v5);
    v3 = vec_ld(0,costab);
    vbs11 = vec_perm(vbs11,vbs11,vinvert);
    vbs15 = vec_perm(vbs15,vbs15,vinvert);
    vbs3 = vec_add(vbs11,vbs12);
    vbs7 = vec_add(vbs15,vbs16);
    v4 = vec_sub(vbs12,vbs11);
    v7 = vec_sub(vbs16,vbs15);
    vbs2 = vec_madd(v2,v3,vzero);
    vbs4 = vec_madd(v4,v3,vzero);
    vbs6 = vec_madd(v6,v3,vzero);
    vbs8 = vec_madd(v7,v3,vzero);

    vbs2 = vec_perm(vbs2,vbs2,vinvert);
    vbs4 = vec_perm(vbs4,vbs4,vinvert);
    vbs6 = vec_perm(vbs6,vbs6,vinvert);
    vbs8 = vec_perm(vbs8,vbs8,vinvert);

    /* fourth pass (pnts[3]): 4-sample butterflies; the groups now straddle
       vector registers, so permute masks gather and redistribute them */
    costab = pnts[3];

#ifdef __APPLE__
    vperm1 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
    vperm2 = (vector unsigned char)(12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27);
    vperm3 = (vector unsigned char)(0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19);
#else
    vperm1 = (vector unsigned char){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};
    vperm2 = (vector unsigned char){12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27};
    vperm3 = (vector unsigned char){0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19};
#endif
    vperm4 = vec_add(vperm3,vec_splat_u8(8));

    v1 = vec_ld(0,costab);
    v2 = vec_splat(v1,0);
    v3 = vec_splat(v1,1);
    v1 = vec_mergeh(v2,v3);

    v2 = vec_perm(vbs1,vbs3,vperm1);
    v3 = vec_perm(vbs2,vbs4,vperm1);
    v4 = vec_perm(vbs1,vbs3,vperm2);
    v5 = vec_perm(vbs2,vbs4,vperm2);
    v6 = vec_sub(v2,v4);
    v7 = vec_sub(v3,v5);
    v2 = vec_add(v2,v4);
    v3 = vec_add(v3,v5);
    v4 = vec_madd(v6,v1,vzero);
    v5 = vec_nmsub(v7,v1,vzero);
    vbs9 = vec_perm(v2,v4,vperm3);
    vbs11 = vec_perm(v2,v4,vperm4);
    vbs10 = vec_perm(v3,v5,vperm3);
    vbs12 = vec_perm(v3,v5,vperm4);

    v2 = vec_perm(vbs5,vbs7,vperm1);
    v3 = vec_perm(vbs6,vbs8,vperm1);
    v4 = vec_perm(vbs5,vbs7,vperm2);
    v5 = vec_perm(vbs6,vbs8,vperm2);
    v6 = vec_sub(v2,v4);
    v7 = vec_sub(v3,v5);
    v2 = vec_add(v2,v4);
    v3 = vec_add(v3,v5);
    v4 = vec_madd(v6,v1,vzero);
    v5 = vec_nmsub(v7,v1,vzero);
    vbs13 = vec_perm(v2,v4,vperm3);
    vbs15 = vec_perm(v2,v4,vperm4);
    vbs14 = vec_perm(v3,v5,vperm3);
    vbs16 = vec_perm(v3,v5,vperm4);

    /* fifth pass (pnts[4]): final 2-sample butterflies via merge/transpose;
       v1 = (+c,-c,+c,-c) applies the alternating sign within each pair */
    costab = pnts[4];

    v1 = vec_lde(0,costab);
#ifdef __APPLE__
    v2 = (vector float)(1.0f,-1.0f,1.0f,-1.0f);
#else
    v2 = (vector float){1.0f,-1.0f,1.0f,-1.0f};
#endif
    v3 = vec_splat(v1,0);
    v1 = vec_madd(v2,v3,vzero);

    v2 = vec_mergeh(vbs9,vbs10);
    v3 = vec_mergel(vbs9,vbs10);
    v4 = vec_mergeh(vbs11,vbs12);
    v5 = vec_mergel(vbs11,vbs12);
    v6 = vec_mergeh(v2,v3);
    v7 = vec_mergel(v2,v3);
    v2 = vec_mergeh(v4,v5);
    v3 = vec_mergel(v4,v5);
    v4 = vec_sub(v6,v7);
    v5 = vec_sub(v2,v3);
    v6 = vec_add(v6,v7);
    v7 = vec_add(v2,v3);
    v2 = vec_madd(v4,v1,vzero);
    v3 = vec_madd(v5,v1,vzero);
    vbs1 = vec_mergeh(v6,v2);
    vbs2 = vec_mergel(v6,v2);
    vbs3 = vec_mergeh(v7,v3);
    vbs4 = vec_mergel(v7,v3);

    v2 = vec_mergeh(vbs13,vbs14);
    v3 = vec_mergel(vbs13,vbs14);
    v4 = vec_mergeh(vbs15,vbs16);
    v5 = vec_mergel(vbs15,vbs16);
    v6 = vec_mergeh(v2,v3);
    v7 = vec_mergel(v2,v3);
    v2 = vec_mergeh(v4,v5);
    v3 = vec_mergel(v4,v5);
    v4 = vec_sub(v6,v7);
    v5 = vec_sub(v2,v3);
    v6 = vec_add(v6,v7);
    v7 = vec_add(v2,v3);
    v2 = vec_madd(v4,v1,vzero);
    v3 = vec_madd(v5,v1,vzero);
    vbs5 = vec_mergeh(v6,v2);
    vbs6 = vec_mergel(v6,v2);
    vbs7 = vec_mergeh(v7,v3);
    vbs8 = vec_mergel(v7,v3);

    /* store the 16 intermediate vectors (64 reals) to the aligned scratch buffer */
    vec_st(vbs1,0,bufs);
    vec_st(vbs2,16,bufs);
    vec_st(vbs3,32,bufs);
    vec_st(vbs4,48,bufs);
    vec_st(vbs5,64,bufs);
    vec_st(vbs6,80,bufs);
    vec_st(vbs7,96,bufs);
    vec_st(vbs8,112,bufs);
    vec_st(vbs9,128,bufs);
    vec_st(vbs10,144,bufs);
    vec_st(vbs11,160,bufs);
    vec_st(vbs12,176,bufs);
    vec_st(vbs13,192,bufs);
    vec_st(vbs14,208,bufs);
    vec_st(vbs15,224,bufs);
    vec_st(vbs16,240,bufs);
  }

  /* the remaining accumulation passes are short and irregular, so they are
     done in scalar code on the bufs[] scratch buffer */
  {
    register real *b1;
    register int i;

    for(b1=bufs,i=8;i;i--,b1+=4)
      b1[2] += b1[3];

    for(b1=bufs,i=4;i;i--,b1+=8)
    {
      b1[4] += b1[6];
      b1[6] += b1[5];
      b1[5] += b1[7];
    }

    for(b1=bufs,i=2;i;i--,b1+=16)
    {
      b1[8] += b1[12];
      b1[12] += b1[10];
      b1[10] += b1[14];
      b1[14] += b1[9];
      b1[9] += b1[13];
      b1[13] += b1[11];
      b1[11] += b1[15];
    }
  }

  /* scatter the results into the two output windows with a stride of
     16 (0x10) reals */
  out0[0x10*16] = bufs[0];
  out0[0x10*15] = bufs[16+0] + bufs[16+8];
  out0[0x10*14] = bufs[8];
  out0[0x10*13] = bufs[16+8] + bufs[16+4];
  out0[0x10*12] = bufs[4];
  out0[0x10*11] = bufs[16+4] + bufs[16+12];
  out0[0x10*10] = bufs[12];
  out0[0x10* 9] = bufs[16+12] + bufs[16+2];
  out0[0x10* 8] = bufs[2];
  out0[0x10* 7] = bufs[16+2] + bufs[16+10];
  out0[0x10* 6] = bufs[10];
  out0[0x10* 5] = bufs[16+10] + bufs[16+6];
  out0[0x10* 4] = bufs[6];
  out0[0x10* 3] = bufs[16+6] + bufs[16+14];
  out0[0x10* 2] = bufs[14];
  out0[0x10* 1] = bufs[16+14] + bufs[16+1];
  out0[0x10* 0] = bufs[1];

  out1[0x10* 0] = bufs[1];
  out1[0x10* 1] = bufs[16+1] + bufs[16+9];
  out1[0x10* 2] = bufs[9];
  out1[0x10* 3] = bufs[16+9] + bufs[16+5];
  out1[0x10* 4] = bufs[5];
  out1[0x10* 5] = bufs[16+5] + bufs[16+13];
  out1[0x10* 6] = bufs[13];
  out1[0x10* 7] = bufs[16+13] + bufs[16+3];
  out1[0x10* 8] = bufs[3];
  out1[0x10* 9] = bufs[16+3] + bufs[16+11];
  out1[0x10*10] = bufs[11];
  out1[0x10*11] = bufs[16+11] + bufs[16+7];
  out1[0x10*12] = bufs[7];
  out1[0x10*13] = bufs[16+7] + bufs[16+15];
  out1[0x10*14] = bufs[15];
  out1[0x10*15] = bufs[16+15];
}