/*
	decode.c: decoding samples...

	copyright 1995-2006 by the mpg123 project - free software under the terms of the LGPL 2.1
	see COPYING and AUTHORS files in distribution or http://mpg123.org
	initially written by Michael Hipp
	altivec optimization by tmkk
*/

#include "mpg123lib_intern.h"

#ifndef __APPLE__
#include <altivec.h>
#endif

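/* Clip the accumulated sample value to signed 16-bit range, counting clipped samples. */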
#define WRITE_SAMPLE(samples,sum,clip) \
	if( (sum) > REAL_PLUS_32767) { *(samples) = 0x7fff; (clip)++; } \
	else if( (sum) < REAL_MINUS_32768) { *(samples) = -0x8000; (clip)++; } \
	else { *(samples) = REAL_TO_SHORT(sum); }

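/* 8-bit output: decode one channel into a temporary 16-bit buffer, then downconvert through the conv16to8 table. */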
int synth_1to1_8bit_altivec(real *bandPtr, int channel, mpg123_handle *fr, int final)
{
	short samples_tmp[64];
	short *tmp1 = samples_tmp + channel;
	int i,ret;

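	/* save buffer stuff, trick samples_tmp into there, decode, restore */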
	unsigned char *samples = fr->buffer.data;
	int pnt = fr->buffer.fill;
	fr->buffer.data = (unsigned char*) samples_tmp;
	fr->buffer.fill = 0;
	ret = synth_1to1_altivec(bandPtr, channel, fr, 0);
	fr->buffer.data = samples;

	samples += channel + pnt;
	for(i=0;i<32;i++) {
		*samples = fr->conv16to8[*tmp1>>AUSHIFT];
		samples += 2;
		tmp1 += 2;
	}
	fr->buffer.fill = pnt + (final ? 64 : 0);

	return ret;
}

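/* Mono 8-bit output: decode channel 0 and write one byte per sample. */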
int synth_1to1_8bit_mono_altivec(real *bandPtr, mpg123_handle *fr)
{
	sample_t samples_tmp[64];
	sample_t *tmp1 = samples_tmp;
	int i,ret;

	/* save buffer stuff, trick samples_tmp into there, decode, restore */
	unsigned char *samples = fr->buffer.data;
	int pnt = fr->buffer.fill;
	fr->buffer.data = (unsigned char*) samples_tmp;
	fr->buffer.fill = 0;
	ret = synth_1to1_altivec(bandPtr, 0, fr, 0);
	fr->buffer.data = samples; /* restore original value */

	samples += pnt;
	for(i=0;i<32;i++) {
#ifdef FLOATOUT
		*samples++ = 0;
#else
		*samples++ = fr->conv16to8[*tmp1>>AUSHIFT];
#endif
		tmp1 += 2;
	}
	fr->buffer.fill = pnt + 32;

	return ret;
}

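/* Mono-to-stereo 8-bit output: decode channel 0 and write each converted byte twice. */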
int synth_1to1_8bit_mono2stereo_altivec(real *bandPtr, mpg123_handle *fr)
{
	sample_t samples_tmp[64];
	sample_t *tmp1 = samples_tmp;
	int i,ret;

	/* save buffer stuff, trick samples_tmp into there, decode, restore */
	unsigned char *samples = fr->buffer.data;
	int pnt = fr->buffer.fill;
	fr->buffer.data = (unsigned char*) samples_tmp;
	fr->buffer.fill = 0;
	ret = synth_1to1_altivec(bandPtr, 0, fr, 0);
	fr->buffer.data = samples; /* restore original value */

	samples += pnt;
	for(i=0;i<32;i++) {
#ifdef FLOATOUT
		*samples++ = 0;
		*samples++ = 0;
#else
		*samples++ = fr->conv16to8[*tmp1>>AUSHIFT];
		*samples++ = fr->conv16to8[*tmp1>>AUSHIFT];
#endif
		tmp1 += 2;
	}
	fr->buffer.fill = pnt + 64;

	return ret;
}

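/* Mono 16-bit output: decode channel 0 into a temporary buffer and copy every other sample out. */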
int synth_1to1_mono_altivec(real *bandPtr, mpg123_handle *fr)
{
	sample_t samples_tmp[64];
	sample_t *tmp1 = samples_tmp;
	int i,ret;

	/* save buffer stuff, trick samples_tmp into there, decode, restore */
	unsigned char *samples = fr->buffer.data;
	int pnt = fr->buffer.fill;
	fr->buffer.data = (unsigned char*) samples_tmp;
	fr->buffer.fill = 0;
	ret = synth_1to1_altivec(bandPtr, 0, fr, 0); /* decode into samples_tmp */
	fr->buffer.data = samples; /* restore original value */

	/* now append samples from samples_tmp */
	samples += pnt; /* just the next mem in frame buffer */
	for(i=0;i<32;i++){
		*( (sample_t *)samples) = *tmp1;
		samples += sizeof(sample_t);
		tmp1 += 2;
	}
	fr->buffer.fill = pnt + 32*sizeof(sample_t);

	return ret;
}

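/* Mono-to-stereo 16-bit output: decode into the frame buffer as the left channel, then duplicate each sample into the right slot. */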
int synth_1to1_mono2stereo_altivec(real *bandPtr, mpg123_handle *fr)
{
	int i,ret;
	unsigned char *samples = fr->buffer.data;

	ret = synth_1to1_altivec(bandPtr, 0, fr, 1);
	samples += fr->buffer.fill - 64*sizeof(sample_t);

	for(i=0;i<32;i++) {
		((sample_t *)samples)[1] = ((sample_t *)samples)[0];
		samples += 2*sizeof(sample_t);
	}

	return ret;
}

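/*
	The core 1:1 synthesis: run the 64-point DCT on the new subband samples, then
	window the ring buffer into 32 16-bit output samples for one channel
	(interleaved with the other channel in the output buffer).
*/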
int synth_1to1_altivec(real *bandPtr, int channel, mpg123_handle *fr, int final)
{
	static const int step = 2;
	sample_t *samples = (sample_t *) (fr->buffer.data+fr->buffer.fill);

	real *b0, **buf;
	int clip = 0;
	int bo1;

	if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);

	if(!channel) {
		fr->bo[0]--;
		fr->bo[0] &= 0xf;
		buf = fr->real_buffs[0];
	}
	else {
		samples++;
		buf = fr->real_buffs[1];
	}

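	/* Run dct64 into the ring buffers; the low bit of fr->bo[0] selects which
	   buffer and offset (b0, bo1) the windowing below reads from. */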
	if(fr->bo[0] & 0x1) {
		b0 = buf[0];
		bo1 = fr->bo[0];
		dct64_altivec(buf[1]+((fr->bo[0]+1)&0xf),buf[0]+fr->bo[0],bandPtr);
	}
	else {
		b0 = buf[1];
		bo1 = fr->bo[0]+1;
		dct64_altivec(buf[0]+fr->bo[0],buf[1]+fr->bo[0]+1,bandPtr);
	}

	{
		register int j;
		real *window = opt_decwin(fr) + 16 - bo1;

		ALIGNED(16) int clip_tmp[4];
		vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
		vector unsigned char vperm1,vperm2,vperm3,vperm4,vperm5;
		vector float vsum,vsum2,vsum3,vsum4,vmin,vmax;
		vector signed int vclip;
		vector signed short vsample1,vsample2;
		vclip = vec_xor(vclip,vclip);
#ifdef __APPLE__
		vmax = (vector float)(32767.0f);
		vmin = (vector float)(-32768.0f);
		vperm5 = (vector unsigned char)(0,1,18,19,2,3,22,23,4,5,26,27,6,7,30,31);
#else
		vmax = (vector float){32767.0f,32767.0f,32767.0f,32767.0f};
		vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
		vperm5 = (vector unsigned char){0,1,18,19,2,3,22,23,4,5,26,27,6,7,30,31};
#endif

		vperm1 = vec_lvsl(0,window);
		vperm3 = vec_lvsl(0,samples);
		vperm4 = vec_lvsr(0,samples);
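		/* First pass: four iterations, each producing four output samples. Each sample
		   is a dot product of 16 window coefficients with the synthesis buffer; the
		   merge/sub/add sequence at the end transposes the four accumulators and
		   applies the alternating sign pattern of the window. */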
		for (j=4;j;j--)
		{
			vsum = vec_xor(vsum,vsum);
			vsum2 = vec_xor(vsum2,vsum2);
			vsum3 = vec_xor(vsum3,vsum3);
			vsum4 = vec_xor(vsum4,vsum4);
			v1 = vec_ld(0,window);
			v2 = vec_ld(16,window);
			v3 = vec_ld(32,window);
			v4 = vec_ld(48,window);
			v5 = vec_ld(64,window);
			v1 = vec_perm(v1,v2,vperm1);
			v6 = vec_ld(0,b0);
			v2 = vec_perm(v2,v3,vperm1);
			v7 = vec_ld(16,b0);
			v3 = vec_perm(v3,v4,vperm1);
			v8 = vec_ld(32,b0);
			v4 = vec_perm(v4,v5,vperm1);
			v9 = vec_ld(48,b0);

			vsum = vec_madd(v1,v6,vsum);
			vsum = vec_madd(v2,v7,vsum);
			vsum = vec_madd(v3,v8,vsum);
			vsum = vec_madd(v4,v9,vsum);

			window += 32;
			b0 += 16;

			v1 = vec_ld(0,window);
			v2 = vec_ld(16,window);
			v3 = vec_ld(32,window);
			v4 = vec_ld(48,window);
			v5 = vec_ld(64,window);
			v1 = vec_perm(v1,v2,vperm1);
			v6 = vec_ld(0,b0);
			v2 = vec_perm(v2,v3,vperm1);
			v7 = vec_ld(16,b0);
			v3 = vec_perm(v3,v4,vperm1);
			v8 = vec_ld(32,b0);
			v4 = vec_perm(v4,v5,vperm1);
			v9 = vec_ld(48,b0);

			vsum2 = vec_madd(v1,v6,vsum2);
			vsum2 = vec_madd(v2,v7,vsum2);
			vsum2 = vec_madd(v3,v8,vsum2);
			vsum2 = vec_madd(v4,v9,vsum2);

			window += 32;
			b0 += 16;

			v1 = vec_ld(0,window);
			v2 = vec_ld(16,window);
			v3 = vec_ld(32,window);
			v4 = vec_ld(48,window);
			v5 = vec_ld(64,window);
			v1 = vec_perm(v1,v2,vperm1);
			v6 = vec_ld(0,b0);
			v2 = vec_perm(v2,v3,vperm1);
			v7 = vec_ld(16,b0);
			v3 = vec_perm(v3,v4,vperm1);
			v8 = vec_ld(32,b0);
			v4 = vec_perm(v4,v5,vperm1);
			v9 = vec_ld(48,b0);

			vsum3 = vec_madd(v1,v6,vsum3);
			vsum3 = vec_madd(v2,v7,vsum3);
			vsum3 = vec_madd(v3,v8,vsum3);
			vsum3 = vec_madd(v4,v9,vsum3);

			window += 32;
			b0 += 16;

			v1 = vec_ld(0,window);
			v2 = vec_ld(16,window);
			v3 = vec_ld(32,window);
			v4 = vec_ld(48,window);
			v5 = vec_ld(64,window);
			v1 = vec_perm(v1,v2,vperm1);
			v6 = vec_ld(0,b0);
			v2 = vec_perm(v2,v3,vperm1);
			v7 = vec_ld(16,b0);
			v3 = vec_perm(v3,v4,vperm1);
			v8 = vec_ld(32,b0);
			v4 = vec_perm(v4,v5,vperm1);
			v9 = vec_ld(48,b0);

			vsum4 = vec_madd(v1,v6,vsum4);
			vsum4 = vec_madd(v2,v7,vsum4);
			vsum4 = vec_madd(v3,v8,vsum4);
			vsum4 = vec_madd(v4,v9,vsum4);

			window += 32;
			b0 += 16;

			v1 = vec_mergeh(vsum,vsum3);
			v2 = vec_mergeh(vsum2,vsum4);
			v3 = vec_mergel(vsum,vsum3);
			v4 = vec_mergel(vsum2,vsum4);
			v5 = vec_mergeh(v1,v2);
			v6 = vec_mergel(v1,v2);
			v7 = vec_mergeh(v3,v4);
			v8 = vec_mergel(v3,v4);

			vsum = vec_sub(v5,v6);
			v9 = vec_sub(v7,v8);
			vsum = vec_add(vsum,v9);

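			/* Convert to integers, pack to 16 bit and merge the four new samples into
			   this channel's slots of the (possibly unaligned) output buffer, keeping
			   the interleaved data of the other channel; the compares flag clipped
			   values. */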
			v3 = (vector float)vec_cts(vsum,0);
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(15,samples);
			v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
			v4 = (vector float)vec_perm(vsample1,vsample2,vperm3);
			v5 = (vector float)vec_perm(v3,v4,vperm5);
			v6 = (vector float)vec_perm(vsample2,vsample1,vperm3);
			v7 = (vector float)vec_perm(v5,v6,vperm4);
			v8 = (vector float)vec_perm(v6,v5,vperm4);
			vec_st((vector signed short)v7,15,samples);
			vec_st((vector signed short)v8,0,samples);
			samples += 8;
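			/* The compare results are all-ones in clipped lanes; shift them down to 0/1
			   and let vec_sums accumulate the running clip count in vclip. */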
#ifdef __APPLE__
			v1 = (vector float)vec_sr((vector unsigned int)v1,(vector unsigned int)(31));
			v2 = (vector float)vec_sr((vector unsigned int)v2,(vector unsigned int)(31));
#else
			v1 = (vector float)vec_sr((vector unsigned int)v1,(vector unsigned int){31,31,31,31});
			v2 = (vector float)vec_sr((vector unsigned int)v2,(vector unsigned int){31,31,31,31});
#endif
			v5 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v5,vclip);
		}

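		/* The middle output sample uses only eight window taps and is done in scalar code. */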
		{
			real sum;
			sum = REAL_MUL(window[0x0], b0[0x0]);
			sum += REAL_MUL(window[0x2], b0[0x2]);
			sum += REAL_MUL(window[0x4], b0[0x4]);
			sum += REAL_MUL(window[0x6], b0[0x6]);
			sum += REAL_MUL(window[0x8], b0[0x8]);
			sum += REAL_MUL(window[0xA], b0[0xA]);
			sum += REAL_MUL(window[0xC], b0[0xC]);
			sum += REAL_MUL(window[0xE], b0[0xE]);
			WRITE_SAMPLE(samples,sum,clip);
			b0-=0x10,window-=0x20,samples+=step;
		}
		window += bo1<<1;

		vperm1 = vec_lvsl(0,window);
#ifdef __APPLE__
		vperm2 = vec_perm(vperm1,vperm1,(vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3));
#else
		vperm2 = vec_perm(vperm1,vperm1,(vector unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3});
#endif
		vperm3 = vec_lvsl(0,samples);
		vperm4 = vec_lvsr(0,samples);
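		/* Second pass: three iterations of four samples each, reading the window
		   backwards; vec_nmsub accumulates the negated products, matching the
		   subtractions of the scalar reference synth. */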
		for (j=3;j;j--)
		{
			vsum = vec_xor(vsum,vsum);
			vsum2 = vec_xor(vsum2,vsum2);
			vsum3 = vec_xor(vsum3,vsum3);
			vsum4 = vec_xor(vsum4,vsum4);
			v1 = vec_ld(-1,window);
			v2 = vec_ld(-16,window);
			v3 = vec_ld(-32,window);
			v4 = vec_ld(-48,window);
			v5 = vec_ld(-64,window);
			v1 = vec_perm(v2,v1,vperm2);
			v6 = vec_ld(0,b0);
			v2 = vec_perm(v3,v2,vperm2);
			v7 = vec_ld(16,b0);
			v3 = vec_perm(v4,v3,vperm2);
			v8 = vec_ld(32,b0);
			v4 = vec_perm(v5,v4,vperm2);
			v9 = vec_ld(48,b0);

			vsum = vec_nmsub(v1,v6,vsum);
			vsum = vec_nmsub(v2,v7,vsum);
			vsum = vec_nmsub(v3,v8,vsum);
			vsum = vec_nmsub(v4,v9,vsum);

			window -= 32;
			b0 -= 16;

			v1 = vec_ld(0,window);
			v2 = vec_ld(-16,window);
			v3 = vec_ld(-32,window);
			v4 = vec_ld(-48,window);
			v5 = vec_ld(-64,window);
			v1 = vec_perm(v2,v1,vperm2);
			v6 = vec_ld(0,b0);
			v2 = vec_perm(v3,v2,vperm2);
			v7 = vec_ld(16,b0);
			v3 = vec_perm(v4,v3,vperm2);
			v8 = vec_ld(32,b0);
			v4 = vec_perm(v5,v4,vperm2);
			v9 = vec_ld(48,b0);

			vsum2 = vec_nmsub(v1,v6,vsum2);
			vsum2 = vec_nmsub(v2,v7,vsum2);
			vsum2 = vec_nmsub(v3,v8,vsum2);
			vsum2 = vec_nmsub(v4,v9,vsum2);

			window -= 32;
			b0 -= 16;

			v1 = vec_ld(0,window);
			v2 = vec_ld(-16,window);
			v3 = vec_ld(-32,window);
			v4 = vec_ld(-48,window);
			v5 = vec_ld(-64,window);
			v1 = vec_perm(v2,v1,vperm2);
			v6 = vec_ld(0,b0);
			v2 = vec_perm(v3,v2,vperm2);
			v7 = vec_ld(16,b0);
			v3 = vec_perm(v4,v3,vperm2);
			v8 = vec_ld(32,b0);
			v4 = vec_perm(v5,v4,vperm2);
			v9 = vec_ld(48,b0);

			vsum3 = vec_nmsub(v1,v6,vsum3);
			vsum3 = vec_nmsub(v2,v7,vsum3);
			vsum3 = vec_nmsub(v3,v8,vsum3);
			vsum3 = vec_nmsub(v4,v9,vsum3);

			window -= 32;
			b0 -= 16;

			v1 = vec_ld(0,window);
			v2 = vec_ld(-16,window);
			v3 = vec_ld(-32,window);
			v4 = vec_ld(-48,window);
			v5 = vec_ld(-64,window);
			v1 = vec_perm(v2,v1,vperm2);
			v6 = vec_ld(0,b0);
			v2 = vec_perm(v3,v2,vperm2);
			v7 = vec_ld(16,b0);
			v3 = vec_perm(v4,v3,vperm2);
			v8 = vec_ld(32,b0);
			v4 = vec_perm(v5,v4,vperm2);
			v9 = vec_ld(48,b0);

			vsum4 = vec_nmsub(v1,v6,vsum4);
			vsum4 = vec_nmsub(v2,v7,vsum4);
			vsum4 = vec_nmsub(v3,v8,vsum4);
			vsum4 = vec_nmsub(v4,v9,vsum4);

			window -= 32;
			b0 -= 16;

			v1 = vec_mergeh(vsum,vsum3);
			v2 = vec_mergeh(vsum2,vsum4);
			v3 = vec_mergel(vsum,vsum3);
			v4 = vec_mergel(vsum2,vsum4);
			v5 = vec_mergeh(v1,v2);
			v6 = vec_mergel(v1,v2);
			v7 = vec_mergeh(v3,v4);
			v8 = vec_mergel(v3,v4);

			vsum = vec_add(v5,v6);
			v9 = vec_add(v7,v8);
			vsum = vec_add(vsum,v9);

			v3 = (vector float)vec_cts(vsum,0);
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(15,samples);
			v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
			v4 = (vector float)vec_perm(vsample1,vsample2,vperm3);
			v5 = (vector float)vec_perm(v3,v4,vperm5);
			v6 = (vector float)vec_perm(vsample2,vsample1,vperm3);
			v7 = (vector float)vec_perm(v5,v6,vperm4);
			v8 = (vector float)vec_perm(v6,v5,vperm4);
			vec_st((vector signed short)v7,15,samples);
			vec_st((vector signed short)v8,0,samples);
			samples += 8;
#ifdef __APPLE__
			v1 = (vector float)vec_sr((vector unsigned int)v1,(vector unsigned int)(31));
			v2 = (vector float)vec_sr((vector unsigned int)v2,(vector unsigned int)(31));
#else
			v1 = (vector float)vec_sr((vector unsigned int)v1,(vector unsigned int){31,31,31,31});
			v2 = (vector float)vec_sr((vector unsigned int)v2,(vector unsigned int){31,31,31,31});
#endif
			v5 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v5,vclip);
		}
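		/* Final step: only three output samples remain, so vperm5 is adjusted to leave
		   the unused lanes of the buffer untouched and the pointer advances by six
		   shorts. */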
#ifdef __APPLE__
		vperm5 = (vector unsigned char)(0,1,18,19,2,3,22,23,4,5,26,27,28,29,30,31);
#else
		vperm5 = (vector unsigned char){0,1,18,19,2,3,22,23,4,5,26,27,28,29,30,31};
#endif
		{
			vsum = vec_xor(vsum,vsum);
			vsum2 = vec_xor(vsum2,vsum2);
			vsum3 = vec_xor(vsum3,vsum3);
			vsum4 = vec_xor(vsum4,vsum4);
			v1 = vec_ld(-1,window);
			v2 = vec_ld(-16,window);
			v3 = vec_ld(-32,window);
			v4 = vec_ld(-48,window);
			v5 = vec_ld(-64,window);
			v1 = vec_perm(v2,v1,vperm2);
			v6 = vec_ld(0,b0);
			v2 = vec_perm(v3,v2,vperm2);
			v7 = vec_ld(16,b0);
			v3 = vec_perm(v4,v3,vperm2);
			v8 = vec_ld(32,b0);
			v4 = vec_perm(v5,v4,vperm2);
			v9 = vec_ld(48,b0);

			vsum = vec_nmsub(v1,v6,vsum);
			vsum = vec_nmsub(v2,v7,vsum);
			vsum = vec_nmsub(v3,v8,vsum);
			vsum = vec_nmsub(v4,v9,vsum);

			window -= 32;
			b0 -= 16;

			v1 = vec_ld(0,window);
			v2 = vec_ld(-16,window);
			v3 = vec_ld(-32,window);
			v4 = vec_ld(-48,window);
			v5 = vec_ld(-64,window);
			v1 = vec_perm(v2,v1,vperm2);
			v6 = vec_ld(0,b0);
			v2 = vec_perm(v3,v2,vperm2);
			v7 = vec_ld(16,b0);
			v3 = vec_perm(v4,v3,vperm2);
			v8 = vec_ld(32,b0);
			v4 = vec_perm(v5,v4,vperm2);
			v9 = vec_ld(48,b0);

			vsum2 = vec_nmsub(v1,v6,vsum2);
			vsum2 = vec_nmsub(v2,v7,vsum2);
			vsum2 = vec_nmsub(v3,v8,vsum2);
			vsum2 = vec_nmsub(v4,v9,vsum2);

			window -= 32;
			b0 -= 16;

			v1 = vec_ld(0,window);
			v2 = vec_ld(-16,window);
			v3 = vec_ld(-32,window);
			v4 = vec_ld(-48,window);
			v5 = vec_ld(-64,window);
			v1 = vec_perm(v2,v1,vperm2);
			v6 = vec_ld(0,b0);
			v2 = vec_perm(v3,v2,vperm2);
			v7 = vec_ld(16,b0);
			v3 = vec_perm(v4,v3,vperm2);
			v8 = vec_ld(32,b0);
			v4 = vec_perm(v5,v4,vperm2);
			v9 = vec_ld(48,b0);

			vsum3 = vec_nmsub(v1,v6,vsum3);
			vsum3 = vec_nmsub(v2,v7,vsum3);
			vsum3 = vec_nmsub(v3,v8,vsum3);
			vsum3 = vec_nmsub(v4,v9,vsum3);

			v1 = vec_mergeh(vsum,vsum3);
			v2 = vec_mergeh(vsum2,vsum2);
			v3 = vec_mergel(vsum,vsum3);
			v4 = vec_mergel(vsum2,vsum2);
			v5 = vec_mergeh(v1,v2);
			v6 = vec_mergel(v1,v2);
			v7 = vec_mergeh(v3,v4);
			v8 = vec_mergel(v3,v4);

			vsum = vec_add(v5,v6);
			v9 = vec_add(v7,v8);
			vsum = vec_add(vsum,v9);

			v3 = (vector float)vec_cts(vsum,0);
			v1 = (vector float)vec_cmpgt(vsum,vmax);
			v2 = (vector float)vec_cmplt(vsum,vmin);
			vsample1 = vec_ld(0,samples);
			vsample2 = vec_ld(15,samples);
			v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
			v4 = (vector float)vec_perm(vsample1,vsample2,vperm3);
			v5 = (vector float)vec_perm(v3,v4,vperm5);
			v6 = (vector float)vec_perm(vsample2,vsample1,vperm3);
			v7 = (vector float)vec_perm(v5,v6,vperm4);
			v8 = (vector float)vec_perm(v6,v5,vperm4);
			vec_st((vector signed short)v7,15,samples);
			vec_st((vector signed short)v8,0,samples);
			samples += 6;
#ifdef __APPLE__
			v1 = (vector float)vec_sr((vector unsigned int)v1,(vector unsigned int)(31,31,31,32));
			v2 = (vector float)vec_sr((vector unsigned int)v2,(vector unsigned int)(31,31,31,32));
#else
			v1 = (vector float)vec_sr((vector unsigned int)v1,(vector unsigned int){31,31,31,32});
			v2 = (vector float)vec_sr((vector unsigned int)v2,(vector unsigned int){31,31,31,32});
#endif
			v5 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
			vclip = vec_sums((vector signed int)v5,vclip);
			vec_st(vclip,0,clip_tmp);
			clip += clip_tmp[3];
		}
	}
	if(final) fr->buffer.fill += 128;

	return clip;
}