comparison decoders/libmpg123/dct64_altivec.c @ 562:7e08477b0fc1

MP3 decoder upgrade work. Ripped out SMPEG and mpglib support and replaced them with "mpg123.c" and libmpg123. libmpg123 is a much better version of mpglib, so it should solve the problems with MP3s that won't seek, most modern MP3s that won't play at all, etc. Since you no longer have to make a feature tradeoff with SMPEG, and SMPEG is basically rotting, I removed it from the project. There is still work to be done with libmpg123: the MMX, 3DNow!, SSE, AltiVec, etc. decoders aren't enabled at the moment (see the sketch right after this header), and the build system could use some work to make this compile more cleanly. Still: a huge win.
author Ryan C. Gordon <icculus@icculus.org>
date Fri, 30 Jan 2009 02:44:47 -0500
comparing 561:f2985e08589c with 562:7e08477b0fc1
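
The commit message above notes that the MMX, 3DNow!, SSE and AltiVec decoders are not enabled yet. As a rough, hedged sketch only: the snippet below shows one common way a CPU-specific routine like dct64_altivec() gets selected at compile time. The MPG123_USE_ALTIVEC macro and the pick_dct64() wrapper are invented for this illustration; mpg123's real decoder dispatch is more elaborate and lives in its own optimization headers.

    /* Hypothetical compile-time selection of the AltiVec DCT.  The macro and
       the wrapper are illustration-only names, not part of mpg123. */
    #include "mpg123lib_intern.h"

    void dct64(real *out0, real *out1, real *samples);          /* generic C version (dct64.c) */
    void dct64_altivec(real *out0, real *out1, real *samples);  /* this file */

    static void pick_dct64(real *out0, real *out1, real *samples)
    {
    #ifdef MPG123_USE_ALTIVEC
        dct64_altivec(out0, out1, samples);  /* vectorized PowerPC path */
    #else
        dct64(out0, out1, samples);          /* portable fallback */
    #endif
    }

Built with the macro defined (and AltiVec available), the vector path in this file would be used; otherwise the plain C DCT keeps everything working.
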
1 /*
2 dct64_altivec.c: Discrete Cosine Transform (DCT) for Altivec
3
4 copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Michael Hipp
7 altivec optimization by tmkk
8 */
9
10 /*
11 * Discrete Cosine Transform (DCT) for subband synthesis
12 *
13 * -funroll-loops (for gcc) will remove the loops for better performance
15 * using loops in the source code enhances readability
15 *
16 *
17 * TODO: write an optimized version for the down-sampling modes
18 * (in these modes the bands 16-31 (2:1) or 8-31 (4:1) are zero)
19 */
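/* Added note on the TODO above: in the 2:1 down-sampling mode samples[16..31]
   are all zero, so the mirrored loads of b1[31..16] below could simply use
   vzero and the first-stage sums and differences would both equal the lower
   half; 4:1 additionally zeroes samples[8..15].  A down-sampling aware
   variant could skip those loads and permutes entirely. */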
20
21 #include "mpg123lib_intern.h"
22
23 #ifndef __APPLE__
24 #include <altivec.h>
25 #endif
26
27 void dct64_altivec(real *out0,real *out1,real *samples)
28 {
29 ALIGNED(16) real bufs[64];
30
31 {
32 register real *b1,*costab;
33
34 vector unsigned char vinvert,vperm1,vperm2,vperm3,vperm4;
35 vector float v1,v2,v3,v4,v5,v6,v7,v8;
36 vector float vbs1,vbs2,vbs3,vbs4,vbs5,vbs6,vbs7,vbs8;
37 vector float vbs9,vbs10,vbs11,vbs12,vbs13,vbs14,vbs15,vbs16;
38 vector float vzero;
39 b1 = samples;
40 costab = pnts[0];
41
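/* First fold (pnts[0]): vzero is cleared by XORing it with itself; vinvert
   reverses the four 32-bit words of a vector so the upper half of the input
   can be mirrored; vec_lvsl/vec_perm pairs handle possibly unaligned loads
   of the 32 samples.  Sums of samples[i] and samples[31-i] end up in
   vbs1..vbs4, their differences (scaled by the pnts[0] cosine coefficients)
   in vbs5..vbs8. */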
42 vzero = vec_xor(vzero,vzero);
43 #ifdef __APPLE__
44 vinvert = (vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
45 #else
46 vinvert = (vector unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
47 #endif
48 vperm1 = vec_lvsl(0,b1);
49 vperm2 = vec_perm(vperm1,vperm1,vinvert);
50
51 v1 = vec_ld(0,b1);
52 v2 = vec_ld(16,b1);
53 v3 = vec_ld(112,b1);
54 v4 = vec_ld(127,b1);
55 v5 = vec_perm(v1,v2,vperm1); /* b1[0,1,2,3] */
56 v6 = vec_perm(v3,v4,vperm2); /* b1[31,30,29,28] */
57
58 vbs1 = vec_add(v5,v6);
59 vbs8 = vec_sub(v5,v6);
60
61 v1 = vec_ld(32,b1);
62 v4 = vec_ld(96,b1);
63 v5 = vec_perm(v2,v1,vperm1); /* b1[4,5,6,7] */
64 v6 = vec_perm(v4,v3,vperm2); /* b1[27,26,25,24] */
65
66 vbs2 = vec_add(v5,v6);
67 vbs7 = vec_sub(v5,v6);
68
69 v2 = vec_ld(48,b1);
70 v3 = vec_ld(80,b1);
71 v5 = vec_perm(v1,v2,vperm1); /* b1[8,9,10,11] */
72 v6 = vec_perm(v3,v4,vperm2); /* b1[23,22,21,20] */
73
74 vbs3 = vec_add(v5,v6);
75 vbs6 = vec_sub(v5,v6);
76
77 v1 = vec_ld(64,b1);
78 v5 = vec_perm(v2,v1,vperm1); /* b1[12,13,14,15] */
79 v6 = vec_perm(v1,v3,vperm2); /* b1[19,18,17,16] */
80
81 vbs4 = vec_add(v5,v6);
82 vbs5 = vec_sub(v5,v6);
83
84 v1 = vec_ld(0,costab);
85 vbs8 = vec_madd(vbs8,v1,vzero);
86 v2 = vec_ld(16,costab);
87 vbs7 = vec_madd(vbs7,v2,vzero);
88 v3 = vec_ld(32,costab);
89 vbs6 = vec_madd(vbs6,v3,vzero);
90 v4 = vec_ld(48,costab);
91 vbs5 = vec_madd(vbs5,v4,vzero);
92 vbs6 = vec_perm(vbs6,vbs6,vinvert);
93 vbs5 = vec_perm(vbs5,vbs5,vinvert);
94
95
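/* Second fold (pnts[1]): each 16-element half is folded onto itself again;
   sums land in vbs9/vbs10 and vbs13/vbs14, while the differences are scaled
   by the pnts[1] coefficients into vbs11/vbs12 and vbs15/vbs16. */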
96 costab = pnts[1];
97
98 v1 = vec_perm(vbs4,vbs4,vinvert);
99 vbs9 = vec_add(vbs1,v1);
100 v3 = vec_sub(vbs1,v1);
101 v5 = vec_ld(0,costab);
102 v2 = vec_perm(vbs3,vbs3,vinvert);
103 vbs10 = vec_add(vbs2,v2);
104 v4 = vec_sub(vbs2,v2);
105 v6 = vec_ld(16,costab);
106 vbs12 = vec_madd(v3,v5,vzero);
107 vbs11 = vec_madd(v4,v6,vzero);
108
109 v7 = vec_sub(vbs7,vbs6);
110 v8 = vec_sub(vbs8,vbs5);
111 vbs13 = vec_add(vbs5,vbs8);
112 vbs14 = vec_add(vbs6,vbs7);
113 vbs15 = vec_madd(v7,v6,vzero);
114 vbs16 = vec_madd(v8,v5,vzero);
115
116
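/* Third fold (pnts[2]): the same butterfly within each group of eight; every
   difference vector is scaled by the single pnts[2] vector, and the scaled
   results are word-reversed for the next stage. */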
117 costab = pnts[2];
118
119 v1 = vec_perm(vbs10,vbs10,vinvert);
120 v5 = vec_perm(vbs14,vbs14,vinvert);
121 vbs1 = vec_add(v1,vbs9);
122 vbs5 = vec_add(v5,vbs13);
123 v2 = vec_sub(vbs9,v1);
124 v6 = vec_sub(vbs13,v5);
125 v3 = vec_ld(0,costab);
126 vbs11 = vec_perm(vbs11,vbs11,vinvert);
127 vbs15 = vec_perm(vbs15,vbs15,vinvert);
128 vbs3 = vec_add(vbs11,vbs12);
129 vbs7 = vec_add(vbs15,vbs16);
130 v4 = vec_sub(vbs12,vbs11);
131 v7 = vec_sub(vbs16,vbs15);
132 vbs2 = vec_madd(v2,v3,vzero);
133 vbs4 = vec_madd(v4,v3,vzero);
134 vbs6 = vec_madd(v6,v3,vzero);
135 vbs8 = vec_madd(v7,v3,vzero);
136
137 vbs2 = vec_perm(vbs2,vbs2,vinvert);
138 vbs4 = vec_perm(vbs4,vbs4,vinvert);
139 vbs6 = vec_perm(vbs6,vbs6,vinvert);
140 vbs8 = vec_perm(vbs8,vbs8,vinvert);
141
142
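/* Fourth fold (pnts[3]): the butterfly now runs inside groups of four, so
   8-byte permutes (vperm1..vperm4) regroup halves of vector pairs; the two
   pnts[3] coefficients are splatted into a {c0,c1,c0,c1} vector and applied
   with vec_madd/vec_nmsub. */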
143 costab = pnts[3];
144
145 #ifdef __APPLE__
146 vperm1 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
147 vperm2 = (vector unsigned char)(12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27);
148 vperm3 = (vector unsigned char)(0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19);
149 #else
150 vperm1 = (vector unsigned char){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};
151 vperm2 = (vector unsigned char){12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27};
152 vperm3 = (vector unsigned char){0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19};
153 #endif
154 vperm4 = vec_add(vperm3,vec_splat_u8(8));
155
156 v1 = vec_ld(0,costab);
157 v2 = vec_splat(v1,0);
158 v3 = vec_splat(v1,1);
159 v1 = vec_mergeh(v2,v3);
160
161 v2 = vec_perm(vbs1,vbs3,vperm1);
162 v3 = vec_perm(vbs2,vbs4,vperm1);
163 v4 = vec_perm(vbs1,vbs3,vperm2);
164 v5 = vec_perm(vbs2,vbs4,vperm2);
165 v6 = vec_sub(v2,v4);
166 v7 = vec_sub(v3,v5);
167 v2 = vec_add(v2,v4);
168 v3 = vec_add(v3,v5);
169 v4 = vec_madd(v6,v1,vzero);
170 v5 = vec_nmsub(v7,v1,vzero);
171 vbs9 = vec_perm(v2,v4,vperm3);
172 vbs11 = vec_perm(v2,v4,vperm4);
173 vbs10 = vec_perm(v3,v5,vperm3);
174 vbs12 = vec_perm(v3,v5,vperm4);
175
176 v2 = vec_perm(vbs5,vbs7,vperm1);
177 v3 = vec_perm(vbs6,vbs8,vperm1);
178 v4 = vec_perm(vbs5,vbs7,vperm2);
179 v5 = vec_perm(vbs6,vbs8,vperm2);
180 v6 = vec_sub(v2,v4);
181 v7 = vec_sub(v3,v5);
182 v2 = vec_add(v2,v4);
183 v3 = vec_add(v3,v5);
184 v4 = vec_madd(v6,v1,vzero);
185 v5 = vec_nmsub(v7,v1,vzero);
186 vbs13 = vec_perm(v2,v4,vperm3);
187 vbs15 = vec_perm(v2,v4,vperm4);
188 vbs14 = vec_perm(v3,v5,vperm3);
189 vbs16 = vec_perm(v3,v5,vperm4);
190
191
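/* Final fold (pnts[4]): the single remaining coefficient is splatted and
   given alternating signs via {1,-1,1,-1}; merge operations regroup the
   lanes so the last butterfly happens within each vector, and sums and
   scaled differences are interleaved back into vbs1..vbs8. */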
192 costab = pnts[4];
193
194 v1 = vec_lde(0,costab);
195 #ifdef __APPLE__
196 v2 = (vector float)(1.0f,-1.0f,1.0f,-1.0f);
197 #else
198 v2 = (vector float){1.0f,-1.0f,1.0f,-1.0f};
199 #endif
200 v3 = vec_splat(v1,0);
201 v1 = vec_madd(v2,v3,vzero);
202
203 v2 = vec_mergeh(vbs9,vbs10);
204 v3 = vec_mergel(vbs9,vbs10);
205 v4 = vec_mergeh(vbs11,vbs12);
206 v5 = vec_mergel(vbs11,vbs12);
207 v6 = vec_mergeh(v2,v3);
208 v7 = vec_mergel(v2,v3);
209 v2 = vec_mergeh(v4,v5);
210 v3 = vec_mergel(v4,v5);
211 v4 = vec_sub(v6,v7);
212 v5 = vec_sub(v2,v3);
213 v6 = vec_add(v6,v7);
214 v7 = vec_add(v2,v3);
215 v2 = vec_madd(v4,v1,vzero);
216 v3 = vec_madd(v5,v1,vzero);
217 vbs1 = vec_mergeh(v6,v2);
218 vbs2 = vec_mergel(v6,v2);
219 vbs3 = vec_mergeh(v7,v3);
220 vbs4 = vec_mergel(v7,v3);
221
222 v2 = vec_mergeh(vbs13,vbs14);
223 v3 = vec_mergel(vbs13,vbs14);
224 v4 = vec_mergeh(vbs15,vbs16);
225 v5 = vec_mergel(vbs15,vbs16);
226 v6 = vec_mergeh(v2,v3);
227 v7 = vec_mergel(v2,v3);
228 v2 = vec_mergeh(v4,v5);
229 v3 = vec_mergel(v4,v5);
230 v4 = vec_sub(v6,v7);
231 v5 = vec_sub(v2,v3);
232 v6 = vec_add(v6,v7);
233 v7 = vec_add(v2,v3);
234 v2 = vec_madd(v4,v1,vzero);
235 v3 = vec_madd(v5,v1,vzero);
236 vbs5 = vec_mergeh(v6,v2);
237 vbs6 = vec_mergel(v6,v2);
238 vbs7 = vec_mergeh(v7,v3);
239 vbs8 = vec_mergel(v7,v3);
240
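/* Store the sixteen intermediate vectors into the 16-byte aligned scratch
   buffer bufs[]; the remaining recombination is done with scalar code. */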
241 vec_st(vbs1,0,bufs);
242 vec_st(vbs2,16,bufs);
243 vec_st(vbs3,32,bufs);
244 vec_st(vbs4,48,bufs);
245 vec_st(vbs5,64,bufs);
246 vec_st(vbs6,80,bufs);
247 vec_st(vbs7,96,bufs);
248 vec_st(vbs8,112,bufs);
249 vec_st(vbs9,128,bufs);
250 vec_st(vbs10,144,bufs);
251 vec_st(vbs11,160,bufs);
252 vec_st(vbs12,176,bufs);
253 vec_st(vbs13,192,bufs);
254 vec_st(vbs14,208,bufs);
255 vec_st(vbs15,224,bufs);
256 vec_st(vbs16,240,bufs);
257
258
259 }
260
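/* Scalar pass: finish the remaining butterfly additions on bufs[] one
   element at a time. */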
261 {
262 register real *b1;
263 register int i;
264
265 for(b1=bufs,i=8;i;i--,b1+=4)
266 b1[2] += b1[3];
267
268 for(b1=bufs,i=4;i;i--,b1+=8)
269 {
270 b1[4] += b1[6];
271 b1[6] += b1[5];
272 b1[5] += b1[7];
273 }
274
275 for(b1=bufs,i=2;i;i--,b1+=16)
276 {
277 b1[8] += b1[12];
278 b1[12] += b1[10];
279 b1[10] += b1[14];
280 b1[14] += b1[9];
281 b1[9] += b1[13];
282 b1[13] += b1[11];
283 b1[11] += b1[15];
284 }
285 }
286
287
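/* Scatter the results into out0/out1 with a stride of 16 (0x10) reals:
   every other output is a single value from bufs[0..15], while the outputs
   in between come from bufs[16..31], usually as a sum of two entries. */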
288 out0[0x10*16] = bufs[0];
289 out0[0x10*15] = bufs[16+0] + bufs[16+8];
290 out0[0x10*14] = bufs[8];
291 out0[0x10*13] = bufs[16+8] + bufs[16+4];
292 out0[0x10*12] = bufs[4];
293 out0[0x10*11] = bufs[16+4] + bufs[16+12];
294 out0[0x10*10] = bufs[12];
295 out0[0x10* 9] = bufs[16+12] + bufs[16+2];
296 out0[0x10* 8] = bufs[2];
297 out0[0x10* 7] = bufs[16+2] + bufs[16+10];
298 out0[0x10* 6] = bufs[10];
299 out0[0x10* 5] = bufs[16+10] + bufs[16+6];
300 out0[0x10* 4] = bufs[6];
301 out0[0x10* 3] = bufs[16+6] + bufs[16+14];
302 out0[0x10* 2] = bufs[14];
303 out0[0x10* 1] = bufs[16+14] + bufs[16+1];
304 out0[0x10* 0] = bufs[1];
305
306 out1[0x10* 0] = bufs[1];
307 out1[0x10* 1] = bufs[16+1] + bufs[16+9];
308 out1[0x10* 2] = bufs[9];
309 out1[0x10* 3] = bufs[16+9] + bufs[16+5];
310 out1[0x10* 4] = bufs[5];
311 out1[0x10* 5] = bufs[16+5] + bufs[16+13];
312 out1[0x10* 6] = bufs[13];
313 out1[0x10* 7] = bufs[16+13] + bufs[16+3];
314 out1[0x10* 8] = bufs[3];
315 out1[0x10* 9] = bufs[16+3] + bufs[16+11];
316 out1[0x10*10] = bufs[11];
317 out1[0x10*11] = bufs[16+11] + bufs[16+7];
318 out1[0x10*12] = bufs[7];
319 out1[0x10*13] = bufs[16+7] + bufs[16+15];
320 out1[0x10*14] = bufs[15];
321 out1[0x10*15] = bufs[16+15];
322
323 }
324
325