comparison decoders/libmpg123/decode_altivec.c @ 562:7e08477b0fc1

MP3 decoder upgrade work. Ripped out SMPEG and mpglib support and replaced them with "mpg123.c" and libmpg123. libmpg123 is a much better version of mpglib, so it should solve the problems with MP3s not seeking, most modern MP3s not playing at all, etc. Since you no longer have to make a tradeoff with SMPEG to get features, and SMPEG is basically rotting, I removed it from the project. There is still work to be done with libmpg123: there are MMX, 3DNow!, SSE, AltiVec, etc. decoders which we don't have enabled at the moment, and the build system could use some work to make this compile more cleanly. Still: a huge win.
author Ryan C. Gordon <icculus@icculus.org>
date Fri, 30 Jan 2009 02:44:47 -0500
comparison of 561:f2985e08589c with 562:7e08477b0fc1
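
The MMX, 3DNow!, SSE, and AltiVec decoders mentioned in the commit message will need some dispatch glue before they can be enabled. The fragment below is a minimal sketch, not part of this changeset, of how a caller could pick the AltiVec synth routine at runtime: have_altivec(), synth_1to1_generic(), and choose_synth() are assumed names used purely for illustration, while synth_1to1_altivec() is the function defined in the file listed below.

#include "mpg123lib_intern.h"   /* provides real and mpg123_handle, as in decode_altivec.c */

/* Declarations for the sketch; only synth_1to1_altivec exists in this file. */
int synth_1to1_altivec(real *bandPtr, int channel, mpg123_handle *fr, int final);
int synth_1to1_generic(real *bandPtr, int channel, mpg123_handle *fr, int final); /* assumed portable fallback */
int have_altivec(void);                                                           /* assumed CPU-feature probe */

typedef int (*synth_fn)(real *bandPtr, int channel, mpg123_handle *fr, int final);

static synth_fn choose_synth(void)
{
#if defined(__ALTIVEC__) || defined(__APPLE_ALTIVEC__)
    if (have_altivec())
        return synth_1to1_altivec;   /* AltiVec routine from this file */
#endif
    return synth_1to1_generic;       /* generic C fallback */
}
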
1 /*
2 decode.c: decoding samples...
3
4 copyright 1995-2006 by the mpg123 project - free software under the terms of the LGPL 2.1
5 see COPYING and AUTHORS files in distribution or http://mpg123.org
6 initially written by Michael Hipp
7 altivec optimization by tmkk
8 */
9
10 #include "mpg123lib_intern.h"
11
12 #ifndef __APPLE__
13 #include <altivec.h>
14 #endif
15
16 #define WRITE_SAMPLE(samples,sum,clip) \
17 if( (sum) > REAL_PLUS_32767) { *(samples) = 0x7fff; (clip)++; } \
18 else if( (sum) < REAL_MINUS_32768) { *(samples) = -0x8000; (clip)++; } \
19 else { *(samples) = REAL_TO_SHORT(sum); }
20
21 int synth_1to1_8bit_altivec(real *bandPtr,int channel, mpg123_handle *fr, int final)
22 {
23 short samples_tmp[64];
24 short *tmp1 = samples_tmp + channel;
25 int i,ret;
26
27 unsigned char *samples = fr->buffer.data;
28 int pnt = fr->buffer.fill;
29 fr->buffer.data = (unsigned char*) samples_tmp;
30 fr->buffer.fill = 0;
31 ret = synth_1to1_altivec(bandPtr, channel, fr, 0);
32 fr->buffer.data = samples;
33
34 samples += channel + pnt;
35 for(i=0;i<32;i++) {
36 *samples = fr->conv16to8[*tmp1>>AUSHIFT];
37 samples += 2;
38 tmp1 += 2;
39 }
40 fr->buffer.fill = pnt + (final ? 64 : 0 );
41
42 return ret;
43 }
44
45 int synth_1to1_8bit_mono_altivec(real *bandPtr, mpg123_handle *fr)
46 {
47 sample_t samples_tmp[64];
48 sample_t *tmp1 = samples_tmp;
49 int i,ret;
50
51 /* save buffer stuff, trick samples_tmp into there, decode, restore */
52 unsigned char *samples = fr->buffer.data;
53 int pnt = fr->buffer.fill;
54 fr->buffer.data = (unsigned char*) samples_tmp;
55 fr->buffer.fill = 0;
56 ret = synth_1to1_altivec(bandPtr,0, fr, 0);
57 fr->buffer.data = samples; /* restore original value */
58
59 samples += pnt;
60 for(i=0;i<32;i++) {
61 #ifdef FLOATOUT
62 *samples++ = 0;
63 #else
64 *samples++ = fr->conv16to8[*tmp1>>AUSHIFT];
65 #endif
66 tmp1 += 2;
67 }
68 fr->buffer.fill = pnt + 32;
69
70 return ret;
71 }
72
73 int synth_1to1_8bit_mono2stereo_altivec(real *bandPtr, mpg123_handle *fr)
74 {
75 sample_t samples_tmp[64];
76 sample_t *tmp1 = samples_tmp;
77 int i,ret;
78
79 /* save buffer stuff, trick samples_tmp into there, decode, restore */
80 unsigned char *samples = fr->buffer.data;
81 int pnt = fr->buffer.fill;
82 fr->buffer.data = (unsigned char*) samples_tmp;
83 fr->buffer.fill = 0;
84 ret = synth_1to1_altivec(bandPtr, 0, fr, 0);
85 fr->buffer.data = samples; /* restore original value */
86
87 samples += pnt;
88 for(i=0;i<32;i++) {
89 #ifdef FLOATOUT
90 *samples++ = 0;
91 *samples++ = 0;
92 #else
93 *samples++ = fr->conv16to8[*tmp1>>AUSHIFT];
94 *samples++ = fr->conv16to8[*tmp1>>AUSHIFT];
95 #endif
96 tmp1 += 2;
97 }
98 fr->buffer.fill = pnt + 64;
99
100 return ret;
101 }
102
103 int synth_1to1_mono_altivec(real *bandPtr, mpg123_handle *fr)
104 {
105 sample_t samples_tmp[64];
106 sample_t *tmp1 = samples_tmp;
107 int i,ret;
108
109 /* save buffer stuff, trick samples_tmp into there, decode, restore */
110 unsigned char *samples = fr->buffer.data;
111 int pnt = fr->buffer.fill;
112 fr->buffer.data = (unsigned char*) samples_tmp;
113 fr->buffer.fill = 0;
114 ret = synth_1to1_altivec(bandPtr, 0, fr, 0); /* decode into samples_tmp */
115 fr->buffer.data = samples; /* restore original value */
116
117 /* now append samples from samples_tmp */
118 samples += pnt; /* just the next mem in frame buffer */
119 for(i=0;i<32;i++){
120 *( (sample_t *)samples) = *tmp1;
121 samples += sizeof(sample_t);
122 tmp1 += 2;
123 }
124 fr->buffer.fill = pnt + 32*sizeof(sample_t);
125
126 return ret;
127 }
128
129
130 int synth_1to1_mono2stereo_altivec(real *bandPtr, mpg123_handle *fr)
131 {
132 int i,ret;
133 unsigned char *samples = fr->buffer.data;
134
135 ret = synth_1to1_altivec(bandPtr,0,fr,1);
136 samples += fr->buffer.fill - 64*sizeof(sample_t);
137
138 for(i=0;i<32;i++) {
139 ((sample_t *)samples)[1] = ((sample_t *)samples)[0];
140 samples+=2*sizeof(sample_t);
141 }
142
143 return ret;
144 }
145
146
147 int synth_1to1_altivec(real *bandPtr,int channel,mpg123_handle *fr, int final)
148 {
149 static const int step = 2;
150 sample_t *samples = (sample_t *) (fr->buffer.data+fr->buffer.fill);
151
152 real *b0, **buf;
153 int clip = 0;
154 int bo1;
155
156 if(fr->have_eq_settings) do_equalizer(bandPtr,channel,fr->equalizer);
157
158 if(!channel) {
159 fr->bo[0]--;
160 fr->bo[0] &= 0xf;
161 buf = fr->real_buffs[0];
162 }
163 else {
164 samples++;
165 buf = fr->real_buffs[1];
166 }
167
168 if(fr->bo[0] & 0x1) {
169 b0 = buf[0];
170 bo1 = fr->bo[0];
171 dct64_altivec(buf[1]+((fr->bo[0]+1)&0xf),buf[0]+fr->bo[0],bandPtr);
172 }
173 else {
174 b0 = buf[1];
175 bo1 = fr->bo[0]+1;
176 dct64_altivec(buf[0]+fr->bo[0],buf[1]+fr->bo[0]+1,bandPtr);
177 }
178
179
180 {
181 register int j;
182 real *window = opt_decwin(fr) + 16 - bo1;
183
184 ALIGNED(16) int clip_tmp[4];
185 vector float v1,v2,v3,v4,v5,v6,v7,v8,v9;
186 vector unsigned char vperm1,vperm2,vperm3,vperm4,vperm5;
187 vector float vsum,vsum2,vsum3,vsum4,vmin,vmax;
188 vector signed int vclip;
189 vector signed short vsample1,vsample2;
190 vclip = vec_xor(vclip,vclip);
191 #ifdef __APPLE__
192 vmax = (vector float)(32767.0f);
193 vmin = (vector float)(-32768.0f);
194 vperm5 = (vector unsigned char)(0,1,18,19,2,3,22,23,4,5,26,27,6,7,30,31);
195 #else
196 vmax = (vector float){32767.0f,32767.0f,32767.0f,32767.0f};
197 vmin = (vector float){-32768.0f,-32768.0f,-32768.0f,-32768.0f};
198 vperm5 = (vector unsigned char){0,1,18,19,2,3,22,23,4,5,26,27,6,7,30,31};
199 #endif
200
201 vperm1 = vec_lvsl(0,window);
202 vperm3 = vec_lvsl(0,samples);
203 vperm4 = vec_lvsr(0,samples);
204 for (j=4;j;j--)
205 {
206 vsum = vec_xor(vsum,vsum);
207 vsum2 = vec_xor(vsum2,vsum2);
208 vsum3 = vec_xor(vsum3,vsum3);
209 vsum4 = vec_xor(vsum4,vsum4);
210 v1 = vec_ld(0,window);
211 v2 = vec_ld(16,window);
212 v3 = vec_ld(32,window);
213 v4 = vec_ld(48,window);
214 v5 = vec_ld(64,window);
215 v1 = vec_perm(v1,v2,vperm1);
216 v6 = vec_ld(0,b0);
217 v2 = vec_perm(v2,v3,vperm1);
218 v7 = vec_ld(16,b0);
219 v3 = vec_perm(v3,v4,vperm1);
220 v8 = vec_ld(32,b0);
221 v4 = vec_perm(v4,v5,vperm1);
222 v9 = vec_ld(48,b0);
223
224 vsum = vec_madd(v1,v6,vsum);
225 vsum = vec_madd(v2,v7,vsum);
226 vsum = vec_madd(v3,v8,vsum);
227 vsum = vec_madd(v4,v9,vsum);
228
229 window += 32;
230 b0 += 16;
231
232 v1 = vec_ld(0,window);
233 v2 = vec_ld(16,window);
234 v3 = vec_ld(32,window);
235 v4 = vec_ld(48,window);
236 v5 = vec_ld(64,window);
237 v1 = vec_perm(v1,v2,vperm1);
238 v6 = vec_ld(0,b0);
239 v2 = vec_perm(v2,v3,vperm1);
240 v7 = vec_ld(16,b0);
241 v3 = vec_perm(v3,v4,vperm1);
242 v8 = vec_ld(32,b0);
243 v4 = vec_perm(v4,v5,vperm1);
244 v9 = vec_ld(48,b0);
245
246 vsum2 = vec_madd(v1,v6,vsum2);
247 vsum2 = vec_madd(v2,v7,vsum2);
248 vsum2 = vec_madd(v3,v8,vsum2);
249 vsum2 = vec_madd(v4,v9,vsum2);
250
251 window += 32;
252 b0 += 16;
253
254 v1 = vec_ld(0,window);
255 v2 = vec_ld(16,window);
256 v3 = vec_ld(32,window);
257 v4 = vec_ld(48,window);
258 v5 = vec_ld(64,window);
259 v1 = vec_perm(v1,v2,vperm1);
260 v6 = vec_ld(0,b0);
261 v2 = vec_perm(v2,v3,vperm1);
262 v7 = vec_ld(16,b0);
263 v3 = vec_perm(v3,v4,vperm1);
264 v8 = vec_ld(32,b0);
265 v4 = vec_perm(v4,v5,vperm1);
266 v9 = vec_ld(48,b0);
267
268 vsum3 = vec_madd(v1,v6,vsum3);
269 vsum3 = vec_madd(v2,v7,vsum3);
270 vsum3 = vec_madd(v3,v8,vsum3);
271 vsum3 = vec_madd(v4,v9,vsum3);
272
273 window += 32;
274 b0 += 16;
275
276 v1 = vec_ld(0,window);
277 v2 = vec_ld(16,window);
278 v3 = vec_ld(32,window);
279 v4 = vec_ld(48,window);
280 v5 = vec_ld(64,window);
281 v1 = vec_perm(v1,v2,vperm1);
282 v6 = vec_ld(0,b0);
283 v2 = vec_perm(v2,v3,vperm1);
284 v7 = vec_ld(16,b0);
285 v3 = vec_perm(v3,v4,vperm1);
286 v8 = vec_ld(32,b0);
287 v4 = vec_perm(v4,v5,vperm1);
288 v9 = vec_ld(48,b0);
289
290 vsum4 = vec_madd(v1,v6,vsum4);
291 vsum4 = vec_madd(v2,v7,vsum4);
292 vsum4 = vec_madd(v3,v8,vsum4);
293 vsum4 = vec_madd(v4,v9,vsum4);
294
295 window += 32;
296 b0 += 16;
297
298 v1 = vec_mergeh(vsum,vsum3);
299 v2 = vec_mergeh(vsum2,vsum4);
300 v3 = vec_mergel(vsum,vsum3);
301 v4 = vec_mergel(vsum2,vsum4);
302 v5 = vec_mergeh(v1,v2);
303 v6 = vec_mergel(v1,v2);
304 v7 = vec_mergeh(v3,v4);
305 v8 = vec_mergel(v3,v4);
306
307 vsum = vec_sub(v5,v6);
308 v9 = vec_sub(v7,v8);
309 vsum = vec_add(vsum,v9);
310
311 v3 = (vector float)vec_cts(vsum,0);
312 v1 = (vector float)vec_cmpgt(vsum,vmax);
313 v2 = (vector float)vec_cmplt(vsum,vmin);
314 vsample1 = vec_ld(0,samples);
315 vsample2 = vec_ld(15,samples);
316 v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
317 v4 = (vector float)vec_perm(vsample1,vsample2,vperm3);
318 v5 = (vector float)vec_perm(v3,v4,vperm5);
319 v6 = (vector float)vec_perm(vsample2,vsample1,vperm3);
320 v7 = (vector float)vec_perm(v5,v6,vperm4);
321 v8 = (vector float)vec_perm(v6,v5,vperm4);
322 vec_st((vector signed short)v7,15,samples);
323 vec_st((vector signed short)v8,0,samples);
324 samples += 8;
325 #ifdef __APPLE__
326 v1 = (vector float)vec_sr((vector unsigned int)v1,(vector unsigned int)(31));
327 v2 = (vector float)vec_sr((vector unsigned int)v2,(vector unsigned int)(31));
328 #else
329 v1 = (vector float)vec_sr((vector unsigned int)v1,(vector unsigned int){31,31,31,31});
330 v2 = (vector float)vec_sr((vector unsigned int)v2,(vector unsigned int){31,31,31,31});
331 #endif
332 v5 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
333 vclip = vec_sums((vector signed int)v5,vclip);
334 }
335
336 {
337 real sum;
338 sum = REAL_MUL(window[0x0], b0[0x0]);
339 sum += REAL_MUL(window[0x2], b0[0x2]);
340 sum += REAL_MUL(window[0x4], b0[0x4]);
341 sum += REAL_MUL(window[0x6], b0[0x6]);
342 sum += REAL_MUL(window[0x8], b0[0x8]);
343 sum += REAL_MUL(window[0xA], b0[0xA]);
344 sum += REAL_MUL(window[0xC], b0[0xC]);
345 sum += REAL_MUL(window[0xE], b0[0xE]);
346 WRITE_SAMPLE(samples,sum,clip);
347 b0-=0x10,window-=0x20,samples+=step;
348 }
349 window += bo1<<1;
350
351 vperm1 = vec_lvsl(0,window);
352 #ifdef __APPLE__
353 vperm2 = vec_perm(vperm1,vperm1,(vector unsigned char)(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3));
354 #else
355 vperm2 = vec_perm(vperm1,vperm1,(vector unsigned char){12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3});
356 #endif
357 vperm3 = vec_lvsl(0,samples);
358 vperm4 = vec_lvsr(0,samples);
359 for (j=3;j;j--)
360 {
361 vsum = vec_xor(vsum,vsum);
362 vsum2 = vec_xor(vsum2,vsum2);
363 vsum3 = vec_xor(vsum3,vsum3);
364 vsum4 = vec_xor(vsum4,vsum4);
365 v1 = vec_ld(-1,window);
366 v2 = vec_ld(-16,window);
367 v3 = vec_ld(-32,window);
368 v4 = vec_ld(-48,window);
369 v5 = vec_ld(-64,window);
370 v1 = vec_perm(v2,v1,vperm2);
371 v6 = vec_ld(0,b0);
372 v2 = vec_perm(v3,v2,vperm2);
373 v7 = vec_ld(16,b0);
374 v3 = vec_perm(v4,v3,vperm2);
375 v8 = vec_ld(32,b0);
376 v4 = vec_perm(v5,v4,vperm2);
377 v9 = vec_ld(48,b0);
378
379 vsum = vec_nmsub(v1,v6,vsum);
380 vsum = vec_nmsub(v2,v7,vsum);
381 vsum = vec_nmsub(v3,v8,vsum);
382 vsum = vec_nmsub(v4,v9,vsum);
383
384 window -= 32;
385 b0 -= 16;
386
387 v1 = vec_ld(0,window);
388 v2 = vec_ld(-16,window);
389 v3 = vec_ld(-32,window);
390 v4 = vec_ld(-48,window);
391 v5 = vec_ld(-64,window);
392 v1 = vec_perm(v2,v1,vperm2);
393 v6 = vec_ld(0,b0);
394 v2 = vec_perm(v3,v2,vperm2);
395 v7 = vec_ld(16,b0);
396 v3 = vec_perm(v4,v3,vperm2);
397 v8 = vec_ld(32,b0);
398 v4 = vec_perm(v5,v4,vperm2);
399 v9 = vec_ld(48,b0);
400
401 vsum2 = vec_nmsub(v1,v6,vsum2);
402 vsum2 = vec_nmsub(v2,v7,vsum2);
403 vsum2 = vec_nmsub(v3,v8,vsum2);
404 vsum2 = vec_nmsub(v4,v9,vsum2);
405
406 window -= 32;
407 b0 -= 16;
408
409 v1 = vec_ld(0,window);
410 v2 = vec_ld(-16,window);
411 v3 = vec_ld(-32,window);
412 v4 = vec_ld(-48,window);
413 v5 = vec_ld(-64,window);
414 v1 = vec_perm(v2,v1,vperm2);
415 v6 = vec_ld(0,b0);
416 v2 = vec_perm(v3,v2,vperm2);
417 v7 = vec_ld(16,b0);
418 v3 = vec_perm(v4,v3,vperm2);
419 v8 = vec_ld(32,b0);
420 v4 = vec_perm(v5,v4,vperm2);
421 v9 = vec_ld(48,b0);
422
423 vsum3 = vec_nmsub(v1,v6,vsum3);
424 vsum3 = vec_nmsub(v2,v7,vsum3);
425 vsum3 = vec_nmsub(v3,v8,vsum3);
426 vsum3 = vec_nmsub(v4,v9,vsum3);
427
428 window -= 32;
429 b0 -= 16;
430
431 v1 = vec_ld(0,window);
432 v2 = vec_ld(-16,window);
433 v3 = vec_ld(-32,window);
434 v4 = vec_ld(-48,window);
435 v5 = vec_ld(-64,window);
436 v1 = vec_perm(v2,v1,vperm2);
437 v6 = vec_ld(0,b0);
438 v2 = vec_perm(v3,v2,vperm2);
439 v7 = vec_ld(16,b0);
440 v3 = vec_perm(v4,v3,vperm2);
441 v8 = vec_ld(32,b0);
442 v4 = vec_perm(v5,v4,vperm2);
443 v9 = vec_ld(48,b0);
444
445 vsum4 = vec_nmsub(v1,v6,vsum4);
446 vsum4 = vec_nmsub(v2,v7,vsum4);
447 vsum4 = vec_nmsub(v3,v8,vsum4);
448 vsum4 = vec_nmsub(v4,v9,vsum4);
449
450 window -= 32;
451 b0 -= 16;
452
453 v1 = vec_mergeh(vsum,vsum3);
454 v2 = vec_mergeh(vsum2,vsum4);
455 v3 = vec_mergel(vsum,vsum3);
456 v4 = vec_mergel(vsum2,vsum4);
457 v5 = vec_mergeh(v1,v2);
458 v6 = vec_mergel(v1,v2);
459 v7 = vec_mergeh(v3,v4);
460 v8 = vec_mergel(v3,v4);
461
462 vsum = vec_add(v5,v6);
463 v9 = vec_add(v7,v8);
464 vsum = vec_add(vsum,v9);
465
466 v3 = (vector float)vec_cts(vsum,0);
467 v1 = (vector float)vec_cmpgt(vsum,vmax);
468 v2 = (vector float)vec_cmplt(vsum,vmin);
469 vsample1 = vec_ld(0,samples);
470 vsample2 = vec_ld(15,samples);
471 v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
472 v4 = (vector float)vec_perm(vsample1,vsample2,vperm3);
473 v5 = (vector float)vec_perm(v3,v4,vperm5);
474 v6 = (vector float)vec_perm(vsample2,vsample1,vperm3);
475 v7 = (vector float)vec_perm(v5,v6,vperm4);
476 v8 = (vector float)vec_perm(v6,v5,vperm4);
477 vec_st((vector signed short)v7,15,samples);
478 vec_st((vector signed short)v8,0,samples);
479 samples += 8;
480 #ifdef __APPLE__
481 v1 = (vector float)vec_sr((vector unsigned int)v1,(vector unsigned int)(31));
482 v2 = (vector float)vec_sr((vector unsigned int)v2,(vector unsigned int)(31));
483 #else
484 v1 = (vector float)vec_sr((vector unsigned int)v1,(vector unsigned int){31,31,31,31});
485 v2 = (vector float)vec_sr((vector unsigned int)v2,(vector unsigned int){31,31,31,31});
486 #endif
487 v5 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
488 vclip = vec_sums((vector signed int)v5,vclip);
489 }
490 #ifdef __APPLE__
491 vperm5 = (vector unsigned char)(0,1,18,19,2,3,22,23,4,5,26,27,28,29,30,31);
492 #else
493 vperm5 = (vector unsigned char){0,1,18,19,2,3,22,23,4,5,26,27,28,29,30,31};
494 #endif
495 {
496 vsum = vec_xor(vsum,vsum);
497 vsum2 = vec_xor(vsum2,vsum2);
498 vsum3 = vec_xor(vsum3,vsum3);
499 vsum4 = vec_xor(vsum4,vsum4);
500 v1 = vec_ld(-1,window);
501 v2 = vec_ld(-16,window);
502 v3 = vec_ld(-32,window);
503 v4 = vec_ld(-48,window);
504 v5 = vec_ld(-64,window);
505 v1 = vec_perm(v2,v1,vperm2);
506 v6 = vec_ld(0,b0);
507 v2 = vec_perm(v3,v2,vperm2);
508 v7 = vec_ld(16,b0);
509 v3 = vec_perm(v4,v3,vperm2);
510 v8 = vec_ld(32,b0);
511 v4 = vec_perm(v5,v4,vperm2);
512 v9 = vec_ld(48,b0);
513
514 vsum = vec_nmsub(v1,v6,vsum);
515 vsum = vec_nmsub(v2,v7,vsum);
516 vsum = vec_nmsub(v3,v8,vsum);
517 vsum = vec_nmsub(v4,v9,vsum);
518
519 window -= 32;
520 b0 -= 16;
521
522 v1 = vec_ld(0,window);
523 v2 = vec_ld(-16,window);
524 v3 = vec_ld(-32,window);
525 v4 = vec_ld(-48,window);
526 v5 = vec_ld(-64,window);
527 v1 = vec_perm(v2,v1,vperm2);
528 v6 = vec_ld(0,b0);
529 v2 = vec_perm(v3,v2,vperm2);
530 v7 = vec_ld(16,b0);
531 v3 = vec_perm(v4,v3,vperm2);
532 v8 = vec_ld(32,b0);
533 v4 = vec_perm(v5,v4,vperm2);
534 v9 = vec_ld(48,b0);
535
536 vsum2 = vec_nmsub(v1,v6,vsum2);
537 vsum2 = vec_nmsub(v2,v7,vsum2);
538 vsum2 = vec_nmsub(v3,v8,vsum2);
539 vsum2 = vec_nmsub(v4,v9,vsum2);
540
541 window -= 32;
542 b0 -= 16;
543
544 v1 = vec_ld(0,window);
545 v2 = vec_ld(-16,window);
546 v3 = vec_ld(-32,window);
547 v4 = vec_ld(-48,window);
548 v5 = vec_ld(-64,window);
549 v1 = vec_perm(v2,v1,vperm2);
550 v6 = vec_ld(0,b0);
551 v2 = vec_perm(v3,v2,vperm2);
552 v7 = vec_ld(16,b0);
553 v3 = vec_perm(v4,v3,vperm2);
554 v8 = vec_ld(32,b0);
555 v4 = vec_perm(v5,v4,vperm2);
556 v9 = vec_ld(48,b0);
557
558 vsum3 = vec_nmsub(v1,v6,vsum3);
559 vsum3 = vec_nmsub(v2,v7,vsum3);
560 vsum3 = vec_nmsub(v3,v8,vsum3);
561 vsum3 = vec_nmsub(v4,v9,vsum3);
562
563 v1 = vec_mergeh(vsum,vsum3);
564 v2 = vec_mergeh(vsum2,vsum2);
565 v3 = vec_mergel(vsum,vsum3);
566 v4 = vec_mergel(vsum2,vsum2);
567 v5 = vec_mergeh(v1,v2);
568 v6 = vec_mergel(v1,v2);
569 v7 = vec_mergeh(v3,v4);
570 v8 = vec_mergel(v3,v4);
571
572 vsum = vec_add(v5,v6);
573 v9 = vec_add(v7,v8);
574 vsum = vec_add(vsum,v9);
575
576 v3 = (vector float)vec_cts(vsum,0);
577 v1 = (vector float)vec_cmpgt(vsum,vmax);
578 v2 = (vector float)vec_cmplt(vsum,vmin);
579 vsample1 = vec_ld(0,samples);
580 vsample2 = vec_ld(15,samples);
581 v3 = (vector float)vec_packs((vector signed int)v3,(vector signed int)v3);
582 v4 = (vector float)vec_perm(vsample1,vsample2,vperm3);
583 v5 = (vector float)vec_perm(v3,v4,vperm5);
584 v6 = (vector float)vec_perm(vsample2,vsample1,vperm3);
585 v7 = (vector float)vec_perm(v5,v6,vperm4);
586 v8 = (vector float)vec_perm(v6,v5,vperm4);
587 vec_st((vector signed short)v7,15,samples);
588 vec_st((vector signed short)v8,0,samples);
589 samples += 6;
590 #ifdef __APPLE__
591 v1 = (vector float)vec_sr((vector unsigned int)v1,(vector unsigned int)(31,31,31,32));
592 v2 = (vector float)vec_sr((vector unsigned int)v2,(vector unsigned int)(31,31,31,32));
593 #else
594 v1 = (vector float)vec_sr((vector unsigned int)v1,(vector unsigned int){31,31,31,32});
595 v2 = (vector float)vec_sr((vector unsigned int)v2,(vector unsigned int){31,31,31,32});
596 #endif
597 v5 = (vector float)vec_add((vector unsigned int)v1,(vector unsigned int)v2);
598 vclip = vec_sums((vector signed int)v5,vclip);
599 vec_st(vclip,0,clip_tmp);
600 clip += clip_tmp[3];
601 }
602 }
603 if(final) fr->buffer.fill += 128;
604
605 return clip;
606 }