/*
	dct64_altivec.c: Discrete Cosine Transform (DCT) for AltiVec

	copyright ?-2006 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Michael Hipp
	altivec optimization by tmkk
*/

/*
 * Discrete Cosine Transform (DCT) for subband synthesis
 *
 * -funroll-loops (for gcc) will remove the loops for better performance
 * using loops in the source code enhances readability
 *
 *
 * TODO: write an optimized version for the down-sampling modes
 *       (in these modes the bands 16-31 (2:1) or 8-31 (4:1) are zero)
 */

#include "mpg123lib_intern.h"

#ifndef __APPLE__
#include <altivec.h>
#endif

void dct64_altivec(real *out0,real *out1,real *samples)
{
	ALIGNED(16) real bufs[64];

	{
		register real *b1,*costab;

		vector unsigned char vinvert,vperm1,vperm2,vperm3,vperm4;
		vector float v1,v2,v3,v4,v5,v6,v7,v8;
		vector float vbs1,vbs2,vbs3,vbs4,vbs5,vbs6,vbs7,vbs8;
		vector float vbs9,vbs10,vbs11,vbs12,vbs13,vbs14,vbs15,vbs16;
		vector float vzero;
		b1 = samples;
		costab = pnts[0];

		vzero = vec_xor(vzero,vzero);
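		/* vinvert reverses the order of the four floats in a vector. vperm1
		   (from vec_lvsl) and vperm2 are used with vec_perm below to assemble
		   forward and reversed runs of four samples from pairs of aligned
		   16-byte loads, since samples need not be 16-byte aligned. */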
#ifdef __APPLE__
		vinvert = (vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
#else
		vinvert = (vector unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3};
#endif
		vperm1 = vec_lvsl(0,b1);
		vperm2 = vec_perm(vperm1,vperm1,vinvert);

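		/* First pass: vbs1..vbs4 collect samples[i] + samples[31-i], while
		   vbs5..vbs8 collect the corresponding differences, which are scaled
		   by the pnts[0] cosine table further down. */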
		v1 = vec_ld(0,b1);
		v2 = vec_ld(16,b1);
		v3 = vec_ld(112,b1);
		v4 = vec_ld(127,b1);
		v5 = vec_perm(v1,v2,vperm1); /* b1[0,1,2,3] */
		v6 = vec_perm(v3,v4,vperm2); /* b1[31,30,29,28] */

		vbs1 = vec_add(v5,v6);
		vbs8 = vec_sub(v5,v6);

		v1 = vec_ld(32,b1);
		v4 = vec_ld(96,b1);
		v5 = vec_perm(v2,v1,vperm1); /* b1[4,5,6,7] */
		v6 = vec_perm(v4,v3,vperm2); /* b1[27,26,25,24] */

		vbs2 = vec_add(v5,v6);
		vbs7 = vec_sub(v5,v6);

		v2 = vec_ld(48,b1);
		v3 = vec_ld(80,b1);
		v5 = vec_perm(v1,v2,vperm1); /* b1[8,9,10,11] */
		v6 = vec_perm(v3,v4,vperm2); /* b1[23,22,21,20] */

		vbs3 = vec_add(v5,v6);
		vbs6 = vec_sub(v5,v6);

		v1 = vec_ld(64,b1);
		v5 = vec_perm(v2,v1,vperm1); /* b1[12,13,14,15] */
		v6 = vec_perm(v1,v3,vperm2); /* b1[19,18,17,16] */

		vbs4 = vec_add(v5,v6);
		vbs5 = vec_sub(v5,v6);

		v1 = vec_ld(0,costab);
		vbs8 = vec_madd(vbs8,v1,vzero);
		v2 = vec_ld(16,costab);
		vbs7 = vec_madd(vbs7,v2,vzero);
		v3 = vec_ld(32,costab);
		vbs6 = vec_madd(vbs6,v3,vzero);
		v4 = vec_ld(48,costab);
		vbs5 = vec_madd(vbs5,v4,vzero);
		vbs6 = vec_perm(vbs6,vbs6,vinvert);
		vbs5 = vec_perm(vbs5,vbs5,vinvert);


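		/* Second pass (pnts[1]): mirrored groups of the first-pass results are
		   summed into vbs9/vbs10 and vbs13/vbs14; their differences, scaled by
		   the pnts[1] cosines, go into vbs11/vbs12 and vbs15/vbs16. */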
		costab = pnts[1];

		v1 = vec_perm(vbs4,vbs4,vinvert);
		vbs9 = vec_add(vbs1,v1);
		v3 = vec_sub(vbs1,v1);
		v5 = vec_ld(0,costab);
		v2 = vec_perm(vbs3,vbs3,vinvert);
		vbs10 = vec_add(vbs2,v2);
		v4 = vec_sub(vbs2,v2);
		v6 = vec_ld(16,costab);
		vbs12 = vec_madd(v3,v5,vzero);
		vbs11 = vec_madd(v4,v6,vzero);

		v7 = vec_sub(vbs7,vbs6);
		v8 = vec_sub(vbs8,vbs5);
		vbs13 = vec_add(vbs5,vbs8);
		vbs14 = vec_add(vbs6,vbs7);
		vbs15 = vec_madd(v7,v6,vzero);
		vbs16 = vec_madd(v8,v5,vzero);


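		/* Third pass (pnts[2]): the same sum / scaled-difference pattern,
		   now within groups of eight values and with a single cosine vector. */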
		costab = pnts[2];

		v1 = vec_perm(vbs10,vbs10,vinvert);
		v5 = vec_perm(vbs14,vbs14,vinvert);
		vbs1 = vec_add(v1,vbs9);
		vbs5 = vec_add(v5,vbs13);
		v2 = vec_sub(vbs9,v1);
		v6 = vec_sub(vbs13,v5);
		v3 = vec_ld(0,costab);
		vbs11 = vec_perm(vbs11,vbs11,vinvert);
		vbs15 = vec_perm(vbs15,vbs15,vinvert);
		vbs3 = vec_add(vbs11,vbs12);
		vbs7 = vec_add(vbs15,vbs16);
		v4 = vec_sub(vbs12,vbs11);
		v7 = vec_sub(vbs16,vbs15);
		vbs2 = vec_madd(v2,v3,vzero);
		vbs4 = vec_madd(v4,v3,vzero);
		vbs6 = vec_madd(v6,v3,vzero);
		vbs8 = vec_madd(v7,v3,vzero);

		vbs2 = vec_perm(vbs2,vbs2,vinvert);
		vbs4 = vec_perm(vbs4,vbs4,vinvert);
		vbs6 = vec_perm(vbs6,vbs6,vinvert);
		vbs8 = vec_perm(vbs8,vbs8,vinvert);


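		/* Fourth pass (pnts[3]): groups of four are handled entirely in
		   registers. vperm1..vperm4 rearrange vector halves so that the sums
		   and the cosine-scaled differences (vec_madd, or vec_nmsub where the
		   difference direction is flipped) land interleaved in vbs9..vbs16. */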
		costab = pnts[3];

#ifdef __APPLE__
		vperm1 = (vector unsigned char)(0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23);
		vperm2 = (vector unsigned char)(12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27);
		vperm3 = (vector unsigned char)(0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19);
#else
		vperm1 = (vector unsigned char){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23};
		vperm2 = (vector unsigned char){12,13,14,15,8,9,10,11,28,29,30,31,24,25,26,27};
		vperm3 = (vector unsigned char){0,1,2,3,4,5,6,7,20,21,22,23,16,17,18,19};
#endif
		vperm4 = vec_add(vperm3,vec_splat_u8(8));

		v1 = vec_ld(0,costab);
		v2 = vec_splat(v1,0);
		v3 = vec_splat(v1,1);
		v1 = vec_mergeh(v2,v3);

		v2 = vec_perm(vbs1,vbs3,vperm1);
		v3 = vec_perm(vbs2,vbs4,vperm1);
		v4 = vec_perm(vbs1,vbs3,vperm2);
		v5 = vec_perm(vbs2,vbs4,vperm2);
		v6 = vec_sub(v2,v4);
		v7 = vec_sub(v3,v5);
		v2 = vec_add(v2,v4);
		v3 = vec_add(v3,v5);
		v4 = vec_madd(v6,v1,vzero);
		v5 = vec_nmsub(v7,v1,vzero);
		vbs9 = vec_perm(v2,v4,vperm3);
		vbs11 = vec_perm(v2,v4,vperm4);
		vbs10 = vec_perm(v3,v5,vperm3);
		vbs12 = vec_perm(v3,v5,vperm4);

		v2 = vec_perm(vbs5,vbs7,vperm1);
		v3 = vec_perm(vbs6,vbs8,vperm1);
		v4 = vec_perm(vbs5,vbs7,vperm2);
		v5 = vec_perm(vbs6,vbs8,vperm2);
		v6 = vec_sub(v2,v4);
		v7 = vec_sub(v3,v5);
		v2 = vec_add(v2,v4);
		v3 = vec_add(v3,v5);
		v4 = vec_madd(v6,v1,vzero);
		v5 = vec_nmsub(v7,v1,vzero);
		vbs13 = vec_perm(v2,v4,vperm3);
		vbs15 = vec_perm(v2,v4,vperm4);
		vbs14 = vec_perm(v3,v5,vperm3);
		vbs16 = vec_perm(v3,v5,vperm4);


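		/* Final pass (pnts[4]): adjacent pairs are combined. The single cosine
		   is splatted and multiplied by {1,-1,1,-1} so one vec_madd covers both
		   difference directions; the vec_mergeh/vec_mergel sequences transpose
		   the data back into store order. */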
		costab = pnts[4];

		v1 = vec_lde(0,costab);
#ifdef __APPLE__
		v2 = (vector float)(1.0f,-1.0f,1.0f,-1.0f);
#else
		v2 = (vector float){1.0f,-1.0f,1.0f,-1.0f};
#endif
		v3 = vec_splat(v1,0);
		v1 = vec_madd(v2,v3,vzero);

		v2 = vec_mergeh(vbs9,vbs10);
		v3 = vec_mergel(vbs9,vbs10);
		v4 = vec_mergeh(vbs11,vbs12);
		v5 = vec_mergel(vbs11,vbs12);
		v6 = vec_mergeh(v2,v3);
		v7 = vec_mergel(v2,v3);
		v2 = vec_mergeh(v4,v5);
		v3 = vec_mergel(v4,v5);
		v4 = vec_sub(v6,v7);
		v5 = vec_sub(v2,v3);
		v6 = vec_add(v6,v7);
		v7 = vec_add(v2,v3);
		v2 = vec_madd(v4,v1,vzero);
		v3 = vec_madd(v5,v1,vzero);
		vbs1 = vec_mergeh(v6,v2);
		vbs2 = vec_mergel(v6,v2);
		vbs3 = vec_mergeh(v7,v3);
		vbs4 = vec_mergel(v7,v3);

		v2 = vec_mergeh(vbs13,vbs14);
		v3 = vec_mergel(vbs13,vbs14);
		v4 = vec_mergeh(vbs15,vbs16);
		v5 = vec_mergel(vbs15,vbs16);
		v6 = vec_mergeh(v2,v3);
		v7 = vec_mergel(v2,v3);
		v2 = vec_mergeh(v4,v5);
		v3 = vec_mergel(v4,v5);
		v4 = vec_sub(v6,v7);
		v5 = vec_sub(v2,v3);
		v6 = vec_add(v6,v7);
		v7 = vec_add(v2,v3);
		v2 = vec_madd(v4,v1,vzero);
		v3 = vec_madd(v5,v1,vzero);
		vbs5 = vec_mergeh(v6,v2);
		vbs6 = vec_mergel(v6,v2);
		vbs7 = vec_mergeh(v7,v3);
		vbs8 = vec_mergel(v7,v3);

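		/* Write the 16 result vectors to the aligned scratch buffer; the rest
		   of the recombination is done with scalar code below. */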
		vec_st(vbs1,0,bufs);
		vec_st(vbs2,16,bufs);
		vec_st(vbs3,32,bufs);
		vec_st(vbs4,48,bufs);
		vec_st(vbs5,64,bufs);
		vec_st(vbs6,80,bufs);
		vec_st(vbs7,96,bufs);
		vec_st(vbs8,112,bufs);
		vec_st(vbs9,128,bufs);
		vec_st(vbs10,144,bufs);
		vec_st(vbs11,160,bufs);
		vec_st(vbs12,176,bufs);
		vec_st(vbs13,192,bufs);
		vec_st(vbs14,208,bufs);
		vec_st(vbs15,224,bufs);
		vec_st(vbs16,240,bufs);


	}

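	/* Final recombination: in-place scalar additions between neighbouring
	   buffer entries complete the transform. */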
	{
		register real *b1;
		register int i;

		for(b1=bufs,i=8;i;i--,b1+=4)
			b1[2] += b1[3];

		for(b1=bufs,i=4;i;i--,b1+=8)
		{
			b1[4] += b1[6];
			b1[6] += b1[5];
			b1[5] += b1[7];
		}

		for(b1=bufs,i=2;i;i--,b1+=16)
		{
			b1[8] += b1[12];
			b1[12] += b1[10];
			b1[10] += b1[14];
			b1[14] += b1[9];
			b1[9] += b1[13];
			b1[13] += b1[11];
			b1[11] += b1[15];
		}
	}

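	/* Distribute the 32 results to out0/out1 with a stride of 0x10 (16),
	   mixing direct values with sums of adjacent partial values. */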
	out0[0x10*16] = bufs[0];
	out0[0x10*15] = bufs[16+0] + bufs[16+8];
	out0[0x10*14] = bufs[8];
	out0[0x10*13] = bufs[16+8] + bufs[16+4];
	out0[0x10*12] = bufs[4];
	out0[0x10*11] = bufs[16+4] + bufs[16+12];
	out0[0x10*10] = bufs[12];
	out0[0x10* 9] = bufs[16+12] + bufs[16+2];
	out0[0x10* 8] = bufs[2];
	out0[0x10* 7] = bufs[16+2] + bufs[16+10];
	out0[0x10* 6] = bufs[10];
	out0[0x10* 5] = bufs[16+10] + bufs[16+6];
	out0[0x10* 4] = bufs[6];
	out0[0x10* 3] = bufs[16+6] + bufs[16+14];
	out0[0x10* 2] = bufs[14];
	out0[0x10* 1] = bufs[16+14] + bufs[16+1];
	out0[0x10* 0] = bufs[1];

	out1[0x10* 0] = bufs[1];
	out1[0x10* 1] = bufs[16+1] + bufs[16+9];
	out1[0x10* 2] = bufs[9];
	out1[0x10* 3] = bufs[16+9] + bufs[16+5];
	out1[0x10* 4] = bufs[5];
	out1[0x10* 5] = bufs[16+5] + bufs[16+13];
	out1[0x10* 6] = bufs[13];
	out1[0x10* 7] = bufs[16+13] + bufs[16+3];
	out1[0x10* 8] = bufs[3];
	out1[0x10* 9] = bufs[16+3] + bufs[16+11];
	out1[0x10*10] = bufs[11];
	out1[0x10*11] = bufs[16+11] + bufs[16+7];
	out1[0x10*12] = bufs[7];
	out1[0x10*13] = bufs[16+7] + bufs[16+15];
	out1[0x10*14] = bufs[15];
	out1[0x10*15] = bufs[16+15];

}