comparison src/video/SDL_blit_N.c @ 1047:ffaaf7ecf685

Altivec-optimized blitters! Vast majority of this work is compliments of Bob Ippolito. http://www.devolution.com/pipermail/sdl/2005-February/067466.html and many other posts.
author Ryan C. Gordon <icculus@icculus.org>
date Sun, 17 Apr 2005 10:19:22 +0000
parents b8d311d90021
children f596fa4f17a6
comparison
equal deleted inserted replaced
1046:f09d5edfc7a3 1047:ffaaf7ecf685
32 #include "SDL_blit.h" 32 #include "SDL_blit.h"
33 #include "SDL_byteorder.h" 33 #include "SDL_byteorder.h"
34 #include "SDL_cpuinfo.h" 34 #include "SDL_cpuinfo.h"
35 35
36 /* Functions to blit from N-bit surfaces to other surfaces */ 36 /* Functions to blit from N-bit surfaces to other surfaces */
37
38 #ifdef USE_ALTIVEC_BLITTERS
39 #include <assert.h>
40 #ifdef MACOSX
41 #include <sys/sysctl.h>
42 #include <stdlib.h>
/*
 * Ask the kernel for the L3 cache size through the "hw.l3cachesize" sysctl.
 * Returns the reported size, or 0 when the sysctl lookup fails.
 */
static size_t GetL3CacheSize( void )
{
    static const char sysctl_name[] = "hw.l3cachesize";
    u_int64_t cache_bytes = 0;
    size_t value_len = sizeof( cache_bytes );

    if( sysctlbyname( sysctl_name, &cache_bytes, &value_len, NULL, 0 ) != 0 ) {
        return 0;
    }

    return cache_bytes;
}
55 #else
/* Fallback when there is no sysctl to query: report a fixed 2 MiB guess. */
static size_t GetL3CacheSize( void )
{
    /* XXX: Just guess G4 */
    enum { GUESSED_L3_BYTES = 2 * 1024 * 1024 };

    return GUESSED_L3_BYTES;
}
61 #endif /* MACOSX */
62
/*
 * Low four bits of the address x: non-zero when x is not 16-byte aligned.
 * The argument is parenthesized so pointer expressions such as (p + 1) are
 * evaluated as pointers before being converted to an integer.
 */
#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F)

/*
 * Permute vector that picks bytes (a,b,c,d) out of each of the four 32-bit
 * words of a 16-byte vector.
 */
#define VSWIZZLE32(a,b,c,d) (vector unsigned char) \
    ( 0x00+(a), 0x00+(b), 0x00+(c), 0x00+(d), \
      0x04+(a), 0x04+(b), 0x04+(c), 0x04+(d), \
      0x08+(a), 0x08+(b), 0x08+(c), 0x08+(d), \
      0x0C+(a), 0x0C+(b), 0x0C+(c), 0x0C+(d) )

/*
 * Assemble a 32-bit pixel from 8-bit channel values using the shifts and
 * masks of dstfmt (a pointer to a pixel format). Every argument is
 * parenthesized so that compound expressions bind as the caller expects.
 */
#define MAKE8888(dstfmt, r, g, b, a) \
    ( (((r)<<(dstfmt)->Rshift)&(dstfmt)->Rmask) | \
      (((g)<<(dstfmt)->Gshift)&(dstfmt)->Gmask) | \
      (((b)<<(dstfmt)->Bshift)&(dstfmt)->Bmask) | \
      (((a)<<(dstfmt)->Ashift)&(dstfmt)->Amask) )
75
76 /*
77 * Data Stream Touch...Altivec cache prefetching.
78 *
79 * Don't use this on a G5...however, the speed boost is very significant
80 * on a G4.
81 */
/* Data-stream channel numbers used for the source and destination streams */
#define DST_CHAN_SRC 1
#define DST_CHAN_DEST 2

/* macro to set DST control word value... */
#define DST_CTRL(size, count, stride) \
    (((size) << 24) | ((count) << 16) | (stride))

/*
 * Permute control used to realign a source stream that is read as
 * vec_perm(previous, vec_ld(15, src), aligner).
 *
 * Unaligned src: the ordinary vec_lvsl(0, src) shift pattern.
 * Aligned src:   vec_ld(15, src) loads the very vector that is wanted, so the
 *                control vec_lvsl(8, src) + 8 is used instead; for an aligned
 *                address that should be indices 16..31, i.e. "take everything
 *                from the second operand".
 *
 * src is parenthesized so that pointer expressions can be passed safely.
 */
#define VEC_ALIGNER(src) ((UNALIGNED_PTR((src))) \
    ? vec_lvsl(0, (src)) \
    : vec_add(vec_lvsl(8, (src)), vec_splat_u8(8)))
92
93 /* Calculate the permute vector used for 32->32 swizzling */
94 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
95 const SDL_PixelFormat *dstfmt)
96 {
97 /*
98 * We have to assume that the bits that aren't used by other
99 * colors is alpha, and it's one complete byte, since some formats
100 * leave alpha with a zero mask, but we should still swizzle the bits.
101 */
102 /* ARGB */
103 const static struct SDL_PixelFormat default_pixel_format = {
104 NULL, 0, 0,
105 0, 0, 0, 0,
106 16, 8, 0, 24,
107 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
108 0, 0};
109 if (!srcfmt) {
110 srcfmt = &default_pixel_format;
111 }
112 if (!dstfmt) {
113 dstfmt = &default_pixel_format;
114 }
115 vector unsigned char plus = (vector unsigned char)( 0x00, 0x00, 0x00, 0x00,
116 0x04, 0x04, 0x04, 0x04,
117 0x08, 0x08, 0x08, 0x08,
118 0x0C, 0x0C, 0x0C, 0x0C );
119 vector unsigned char vswiz;
120 vector unsigned int srcvec;
121 #define RESHIFT(X) (3 - ((X) >> 3))
122 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
123 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
124 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
125 Uint32 amask;
126 /* Use zero for alpha if either surface doesn't have alpha */
127 if (dstfmt->Amask) {
128 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
129 } else {
130 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
131 }
132 #undef RESHIFT
133 ((unsigned int *)&srcvec)[0] = (rmask | gmask | bmask | amask);
134 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
135 return(vswiz);
136 }
137
138 static void Blit_RGB888_RGB565(SDL_BlitInfo *info);
139 static void Blit_RGB888_RGB565Altivec(SDL_BlitInfo *info) {
140 int height = info->d_height;
141 Uint8 *src = (Uint8 *) info->s_pixels;
142 int srcskip = info->s_skip;
143 Uint8 *dst = (Uint8 *) info->d_pixels;
144 int dstskip = info->d_skip;
145 SDL_PixelFormat *srcfmt = info->src;
146 vector unsigned char valpha = vec_splat_u8(0);
147 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
148 vector unsigned char vgmerge = (vector unsigned char)(
149 0x00, 0x02, 0x00, 0x06,
150 0x00, 0x0a, 0x00, 0x0e,
151 0x00, 0x12, 0x00, 0x16,
152 0x00, 0x1a, 0x00, 0x1e);
153 vector unsigned short v1 = vec_splat_u16(1);
154 vector unsigned short v3 = vec_splat_u16(3);
155 vector unsigned short v3f = (vector unsigned short)(
156 0x003f, 0x003f, 0x003f, 0x003f,
157 0x003f, 0x003f, 0x003f, 0x003f);
158 vector unsigned short vfc = (vector unsigned short)(
159 0x00fc, 0x00fc, 0x00fc, 0x00fc,
160 0x00fc, 0x00fc, 0x00fc, 0x00fc);
161 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
162 vf800 = vec_sl(vf800, vec_splat_u16(8));
163
164 while (height--) {
165 vector unsigned char valigner;
166 vector unsigned char voverflow;
167 vector unsigned char vsrc;
168
169 int width = info->d_width;
170 int extrawidth;
171
172 /* do scalar until we can align... */
173 #define ONE_PIXEL_BLEND(condition, widthvar) \
174 while (condition) { \
175 Uint32 pixel; \
176 unsigned sR, sG, sB, sA; \
177 DISEMBLE_RGBA((Uint8 *)src, 4, srcfmt, pixel, \
178 sR, sG, sB, sA); \
179 *(Uint16 *)(dst) = (((sR << 8) & 0x0000F800) | \
180 ((sG << 3) & 0x000007E0) | \
181 ((sB >> 3) & 0x0000001F)); \
182 dst += 2; \
183 src += 4; \
184 widthvar--; \
185 }
186
187 ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
188
189 /* After all that work, here's the vector part! */
190 extrawidth = (width % 8); /* trailing unaligned stores */
191 width -= extrawidth;
192 vsrc = vec_ld(0, src);
193 valigner = VEC_ALIGNER(src);
194
195 while (width) {
196 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
197 vector unsigned int vsrc1, vsrc2;
198 vector unsigned char vdst;
199
200 voverflow = vec_ld(15, src);
201 vsrc = vec_perm(vsrc, voverflow, valigner);
202 vsrc1 = (vector unsigned int)vec_perm(vsrc, valpha, vpermute);
203 src += 16;
204 vsrc = voverflow;
205 voverflow = vec_ld(15, src);
206 vsrc = vec_perm(vsrc, voverflow, valigner);
207 vsrc2 = (vector unsigned int)vec_perm(vsrc, valpha, vpermute);
208 /* 1555 */
209 vpixel = (vector unsigned short)vec_packpx(vsrc1, vsrc2);
210 vgpixel = (vector unsigned short)vec_perm(vsrc1, vsrc2, vgmerge);
211 vgpixel = vec_and(vgpixel, vfc);
212 vgpixel = vec_sl(vgpixel, v3);
213 vrpixel = vec_sl(vpixel, v1);
214 vrpixel = vec_and(vrpixel, vf800);
215 vbpixel = vec_and(vpixel, v3f);
216 vdst = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
217 /* 565 */
218 vdst = vec_or(vdst, (vector unsigned char)vbpixel);
219 vec_st(vdst, 0, dst);
220
221 width -= 8;
222 src += 16;
223 dst += 16;
224 vsrc = voverflow;
225 }
226
227 assert(width == 0);
228
229
230 /* do scalar until we can align... */
231 ONE_PIXEL_BLEND((extrawidth), extrawidth);
232 #undef ONE_PIXEL_BLEND
233
234 src += srcskip; /* move to next row, accounting for pitch. */
235 dst += dstskip;
236 }
237
238
239 }
240
241 static void Blit_RGB565_32Altivec(SDL_BlitInfo *info) {
242 int height = info->d_height;
243 Uint8 *src = (Uint8 *) info->s_pixels;
244 int srcskip = info->s_skip;
245 Uint8 *dst = (Uint8 *) info->d_pixels;
246 int dstskip = info->d_skip;
247 SDL_PixelFormat *srcfmt = info->src;
248 SDL_PixelFormat *dstfmt = info->dst;
249 unsigned alpha;
250 vector unsigned char valpha;
251 vector unsigned char vpermute;
252 vector unsigned short vf800;
253 vector unsigned int v8 = vec_splat_u32(8);
254 vector unsigned int v16 = vec_add(v8, v8);
255 vector unsigned short v2 = vec_splat_u16(2);
256 vector unsigned short v3 = vec_splat_u16(3);
257 /*
258 0x10 - 0x1f is the alpha
259 0x00 - 0x0e evens are the red
260 0x01 - 0x0f odds are zero
261 */
262 vector unsigned char vredalpha1 = (vector unsigned char)(
263 0x10, 0x00, 0x01, 0x01,
264 0x10, 0x02, 0x01, 0x01,
265 0x10, 0x04, 0x01, 0x01,
266 0x10, 0x06, 0x01, 0x01
267 );
268 vector unsigned char vredalpha2 = (vector unsigned char)(
269 vec_add((vector unsigned int)vredalpha1, vec_sl(v8, v16))
270 );
271 /*
272 0x00 - 0x0f is ARxx ARxx ARxx ARxx
273 0x11 - 0x0f odds are blue
274 */
275 vector unsigned char vblue1 = (vector unsigned char)(
276 0x00, 0x01, 0x02, 0x11,
277 0x04, 0x05, 0x06, 0x13,
278 0x08, 0x09, 0x0a, 0x15,
279 0x0c, 0x0d, 0x0e, 0x17
280 );
281 vector unsigned char vblue2 = (vector unsigned char)(
282 vec_add((vector unsigned int)vblue1, v8)
283 );
284 /*
285 0x00 - 0x0f is ARxB ARxB ARxB ARxB
286 0x10 - 0x0e evens are green
287 */
288 vector unsigned char vgreen1 = (vector unsigned char)(
289 0x00, 0x01, 0x10, 0x03,
290 0x04, 0x05, 0x12, 0x07,
291 0x08, 0x09, 0x14, 0x0b,
292 0x0c, 0x0d, 0x16, 0x0f
293 );
294 vector unsigned char vgreen2 = (vector unsigned char)(
295 vec_add((vector unsigned int)vgreen1, vec_sl(v8, v8))
296 );
297
298
299 assert(srcfmt->BytesPerPixel == 2);
300 assert(dstfmt->BytesPerPixel == 4);
301
302 vf800 = (vector unsigned short)vec_splat_u8(-7);
303 vf800 = vec_sl(vf800, vec_splat_u16(8));
304
305 if (dstfmt->Amask && srcfmt->alpha) {
306 ((unsigned char *)&valpha)[0] = alpha = srcfmt->alpha;
307 valpha = vec_splat(valpha, 0);
308 } else {
309 alpha = 0;
310 valpha = vec_splat_u8(0);
311 }
312
313 vpermute = calc_swizzle32(NULL, dstfmt);
314 while (height--) {
315 vector unsigned char valigner;
316 vector unsigned char voverflow;
317 vector unsigned char vsrc;
318
319 int width = info->d_width;
320 int extrawidth;
321
322 /* do scalar until we can align... */
323 #define ONE_PIXEL_BLEND(condition, widthvar) \
324 while (condition) { \
325 unsigned sR, sG, sB; \
326 unsigned short pixel = *((unsigned short *)src); \
327 sR = (pixel >> 8) & 0xf8; \
328 sG = (pixel >> 3) & 0xfc; \
329 sB = (pixel << 3) & 0xf8; \
330 ASSEMBLE_RGBA(dst, 4, dstfmt, sR, sG, sB, alpha); \
331 src += 2; \
332 dst += 4; \
333 widthvar--; \
334 }
335 ONE_PIXEL_BLEND(((UNALIGNED_PTR(dst)) && (width)), width);
336
337 /* After all that work, here's the vector part! */
338 extrawidth = (width % 8); /* trailing unaligned stores */
339 width -= extrawidth;
340 vsrc = vec_ld(0, src);
341 valigner = VEC_ALIGNER(src);
342
343 while (width) {
344 vector unsigned short vR, vG, vB;
345 vector unsigned char vdst1, vdst2;
346
347 voverflow = vec_ld(15, src);
348 vsrc = vec_perm(vsrc, voverflow, valigner);
349
350 vR = vec_and((vector unsigned short)vsrc, vf800);
351 vB = vec_sl((vector unsigned short)vsrc, v3);
352 vG = vec_sl(vB, v2);
353
354 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, valpha, vredalpha1);
355 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
356 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
357 vdst1 = vec_perm(vdst1, valpha, vpermute);
358 vec_st(vdst1, 0, dst);
359
360 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, valpha, vredalpha2);
361 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
362 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
363 vdst2 = vec_perm(vdst2, valpha, vpermute);
364 vec_st(vdst2, 16, dst);
365
366 width -= 8;
367 dst += 32;
368 src += 16;
369 vsrc = voverflow;
370 }
371
372 assert(width == 0);
373
374
375 /* do scalar until we can align... */
376 ONE_PIXEL_BLEND((extrawidth), extrawidth);
377 #undef ONE_PIXEL_BLEND
378
379 src += srcskip; /* move to next row, accounting for pitch. */
380 dst += dstskip;
381 }
382
383 }
384
385 static void BlitNtoNKey(SDL_BlitInfo *info);
386 static void BlitNtoNKeyCopyAlpha(SDL_BlitInfo *info);
387 static void Blit32to32KeyAltivec(SDL_BlitInfo *info)
388 {
389 int height = info->d_height;
390 Uint32 *srcp = (Uint32 *) info->s_pixels;
391 int srcskip = info->s_skip;
392 Uint32 *dstp = (Uint32 *) info->d_pixels;
393 int dstskip = info->d_skip;
394 SDL_PixelFormat *srcfmt = info->src;
395 int srcbpp = srcfmt->BytesPerPixel;
396 SDL_PixelFormat *dstfmt = info->dst;
397 int dstbpp = dstfmt->BytesPerPixel;
398 int copy_alpha = (srcfmt->Amask && dstfmt->Amask);
399 unsigned alpha = dstfmt->Amask ? srcfmt->alpha : 0;
400 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
401 Uint32 ckey = info->src->colorkey;
402 vector unsigned int valpha;
403 vector unsigned char vpermute;
404 vector unsigned char vzero;
405 vector unsigned int vckey;
406 vector unsigned int vrgbmask;
407 vpermute = calc_swizzle32(srcfmt, dstfmt);
408 if (info->d_width < 16) {
409 if(copy_alpha) {
410 return BlitNtoNKeyCopyAlpha(info);
411 } else {
412 return BlitNtoNKey(info);
413 }
414 }
415 vzero = vec_splat_u8(0);
416 if (alpha) {
417 ((unsigned char *)&valpha)[0] = (unsigned char)alpha;
418 valpha = (vector unsigned int)vec_splat((vector unsigned char)valpha, 0);
419 } else {
420 valpha = (vector unsigned int)vzero;
421 }
422 ckey &= rgbmask;
423 ((unsigned int *)&vckey)[0] = ckey;
424 vckey = vec_splat(vckey, 0);
425 ((unsigned int *)&vrgbmask)[0] = rgbmask;
426 vrgbmask = vec_splat(vrgbmask, 0);
427
428 while (height--) {
429 #define ONE_PIXEL_BLEND(condition, widthvar) \
430 if (copy_alpha) { \
431 while (condition) { \
432 Uint32 pixel; \
433 unsigned sR, sG, sB, sA; \
434 DISEMBLE_RGBA((Uint8 *)srcp, srcbpp, srcfmt, pixel, \
435 sR, sG, sB, sA); \
436 if ( (pixel & rgbmask) != ckey ) { \
437 ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
438 sR, sG, sB, sA); \
439 } \
440 ((Uint8 *)dstp) += dstbpp; \
441 ((Uint8 *)srcp) += srcbpp; \
442 widthvar--; \
443 } \
444 } else { \
445 while (condition) { \
446 Uint32 pixel; \
447 unsigned sR, sG, sB; \
448 RETRIEVE_RGB_PIXEL((Uint8 *)srcp, srcbpp, pixel); \
449 if ( pixel != ckey ) { \
450 RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); \
451 ASSEMBLE_RGBA((Uint8 *)dstp, dstbpp, dstfmt, \
452 sR, sG, sB, alpha); \
453 } \
454 ((Uint8 *)dstp) += dstbpp; \
455 ((Uint8 *)srcp) += srcbpp; \
456 widthvar--; \
457 } \
458 }
459 int width = info->d_width;
460 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
461 assert(width > 0);
462 if (width > 0) {
463 int extrawidth = (width % 4);
464 vector unsigned char valigner = VEC_ALIGNER(srcp);
465 vector unsigned int vs = vec_ld(0, srcp);
466 width -= extrawidth;
467 assert(width >= 4);
468 while (width) {
469 vector unsigned char vsel;
470 vector unsigned int vd;
471 vector unsigned int voverflow = vec_ld(15, srcp);
472 /* load the source vec */
473 vs = vec_perm(vs, voverflow, valigner);
474 /* vsel is set for items that match the key */
475 vsel = (vector unsigned char)vec_and(vs, vrgbmask);
476 vsel = (vector unsigned char)vec_cmpeq(vs, vckey);
477 /* permute the src vec to the dest format */
478 vs = vec_perm(vs, valpha, vpermute);
479 /* load the destination vec */
480 vd = vec_ld(0, dstp);
481 /* select the source and dest into vs */
482 vd = (vector unsigned int)vec_sel((vector unsigned char)vs, (vector unsigned char)vd, vsel);
483
484 vec_st(vd, 0, dstp);
485 srcp += 4;
486 width -= 4;
487 dstp += 4;
488 vs = voverflow;
489 }
490 ONE_PIXEL_BLEND((extrawidth), extrawidth);
491 #undef ONE_PIXEL_BLEND
492 srcp += srcskip >> 2;
493 dstp += dstskip >> 2;
494 }
495 }
496 }
497
498 /* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
499 /* Use this on a G5 */
500 static void ConvertAltivec32to32_noprefetch(SDL_BlitInfo *info)
501 {
502 int height = info->d_height;
503 Uint32 *src = (Uint32 *) info->s_pixels;
504 int srcskip = info->s_skip;
505 Uint32 *dst = (Uint32 *) info->d_pixels;
506 int dstskip = info->d_skip;
507 SDL_PixelFormat *srcfmt = info->src;
508 int srcbpp = srcfmt->BytesPerPixel;
509 SDL_PixelFormat *dstfmt = info->dst;
510 int dstbpp = dstfmt->BytesPerPixel;
511 vector unsigned int vzero = vec_splat_u32(0);
512 vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
513 if (dstfmt->Amask && !srcfmt->Amask) {
514 if (srcfmt->alpha) {
515 vector unsigned char valpha;
516 ((unsigned char *)&valpha)[0] = srcfmt->alpha;
517 vzero = (vector unsigned int)vec_splat(valpha, 0);
518 }
519 }
520
521 assert(srcbpp == 4);
522 assert(dstbpp == 4);
523
524 while (height--) {
525 vector unsigned char valigner;
526 vector unsigned int vbits;
527 vector unsigned int voverflow;
528 Uint32 bits;
529 Uint8 r, g, b, a;
530
531 int width = info->d_width;
532 int extrawidth;
533
534 /* do scalar until we can align... */
535 while ((UNALIGNED_PTR(dst)) && (width)) {
536 bits = *(src++);
537 RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
538 *(dst++) = MAKE8888(dstfmt, r, g, b, a);
539 width--;
540 }
541
542 /* After all that work, here's the vector part! */
543 extrawidth = (width % 4);
544 width -= extrawidth;
545 valigner = VEC_ALIGNER(src);
546 vbits = vec_ld(0, src);
547
548 while (width) {
549 voverflow = vec_ld(15, src);
550 src += 4;
551 width -= 4;
552 vbits = vec_perm(vbits, voverflow, valigner); /* src is ready. */
553 vbits = vec_perm(vbits, vzero, vpermute); /* swizzle it. */
554 vec_st(vbits, 0, dst); /* store it back out. */
555 dst += 4;
556 vbits = voverflow;
557 }
558
559 assert(width == 0);
560
561 /* cover pixels at the end of the row that didn't fit in 16 bytes. */
562 while (extrawidth) {
563 bits = *(src++); /* max 7 pixels, don't bother with prefetch. */
564 RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
565 *(dst++) = MAKE8888(dstfmt, r, g, b, a);
566 extrawidth--;
567 }
568
569 src += srcskip >> 2; /* move to next row, accounting for pitch. */
570 dst += dstskip >> 2;
571 }
572
573 }
574
575 /* Altivec code to swizzle one 32-bit surface to a different 32-bit format. */
576 /* Use this on a G4 */
577 static void ConvertAltivec32to32_prefetch(SDL_BlitInfo *info)
578 {
579 const int scalar_dst_lead = sizeof (Uint32) * 4;
580 const int vector_dst_lead = sizeof (Uint32) * 16;
581
582 int height = info->d_height;
583 Uint32 *src = (Uint32 *) info->s_pixels;
584 int srcskip = info->s_skip;
585 Uint32 *dst = (Uint32 *) info->d_pixels;
586 int dstskip = info->d_skip;
587 SDL_PixelFormat *srcfmt = info->src;
588 int srcbpp = srcfmt->BytesPerPixel;
589 SDL_PixelFormat *dstfmt = info->dst;
590 int dstbpp = dstfmt->BytesPerPixel;
591 vector unsigned int vzero = vec_splat_u32(0);
592 vector unsigned char vpermute = calc_swizzle32(srcfmt, dstfmt);
593 if (dstfmt->Amask && !srcfmt->Amask) {
594 if (srcfmt->alpha) {
595 vector unsigned char valpha;
596 ((unsigned char *)&valpha)[0] = srcfmt->alpha;
597 vzero = (vector unsigned int)vec_splat(valpha, 0);
598 }
599 }
600
601 assert(srcbpp == 4);
602 assert(dstbpp == 4);
603
604 while (height--) {
605 vector unsigned char valigner;
606 vector unsigned int vbits;
607 vector unsigned int voverflow;
608 Uint32 bits;
609 Uint8 r, g, b, a;
610
611 int width = info->d_width;
612 int extrawidth;
613
614 /* do scalar until we can align... */
615 while ((UNALIGNED_PTR(dst)) && (width)) {
616 vec_dstt(src+scalar_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_SRC);
617 vec_dstst(dst+scalar_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_DEST);
618 bits = *(src++);
619 RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
620 *(dst++) = MAKE8888(dstfmt, r, g, b, a);
621 width--;
622 }
623
624 /* After all that work, here's the vector part! */
625 extrawidth = (width % 4);
626 width -= extrawidth;
627 valigner = VEC_ALIGNER(src);
628 vbits = vec_ld(0, src);
629
630 while (width) {
631 vec_dstt(src+vector_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_SRC);
632 vec_dstst(dst+vector_dst_lead, DST_CTRL(2,32,1024), DST_CHAN_DEST);
633 voverflow = vec_ld(15, src);
634 src += 4;
635 width -= 4;
636 vbits = vec_perm(vbits, voverflow, valigner); /* src is ready. */
637 vbits = vec_perm(vbits, vzero, vpermute); /* swizzle it. */
638 vec_st(vbits, 0, dst); /* store it back out. */
639 dst += 4;
640 vbits = voverflow;
641 }
642
643 assert(width == 0);
644
645 /* cover pixels at the end of the row that didn't fit in 16 bytes. */
646 while (extrawidth) {
647 bits = *(src++); /* max 7 pixels, don't bother with prefetch. */
648 RGBA_FROM_8888(bits, srcfmt, r, g, b, a);
649 *(dst++) = MAKE8888(dstfmt, r, g, b, a);
650 extrawidth--;
651 }
652
653 src += srcskip >> 2; /* move to next row, accounting for pitch. */
654 dst += dstskip >> 2;
655 }
656
657 vec_dss(DST_CHAN_SRC);
658 vec_dss(DST_CHAN_DEST);
659 }
660
661 static Uint32 GetBlitFeatures( void )
662 {
663 static Uint32 features = 0xffffffff;
664 if (features == 0xffffffff) {
665 /* Provide an override for testing .. */
666 char *override = getenv("SDL_ALTIVEC_BLIT_FEATURES");
667 if (override) {
668 features = 0;
669 sscanf(override, "%u", &features);
670 } else {
671 features = ( 0
672 /* Feature 1 is has-MMX */
673 | ((SDL_HasMMX()) ? 1 : 0)
674 /* Feature 2 is has-AltiVec */
675 | ((SDL_HasAltiVec()) ? 2 : 0)
676 /* Feature 4 is dont-use-prefetch */
677 | ((GetL3CacheSize() == 0) ? 4 : 0)
678 );
679 }
680 }
681 return features;
682 }
683 #else
684 /* Feature 1 is has-MMX */
685 #define GetBlitFeatures() ((Uint32)(SDL_HasMMX() ? 1 : 0))
686 #endif
37 687
38 #ifdef USE_ASMBLIT 688 #ifdef USE_ASMBLIT
39 689
40 /* Heheheh, we coerce Hermes into using SDL blit information */ 690 /* Heheheh, we coerce Hermes into using SDL blit information */
41 #define X86_ASSEMBLER 691 #define X86_ASSEMBLER
404 1054
405 #endif /* USE_ASMBLIT */ 1055 #endif /* USE_ASMBLIT */
406 1056
407 1057
408 /* Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces */ 1058 /* Special optimized blit for RGB 5-6-5 --> 32-bit RGB surfaces */
409 #if ( SDL_BYTEORDER == SDL_LIL_ENDIAN ) 1059 #define RGB565_32(dst, src, map) (map[src[LO]*2] + map[src[HI]*2+1])
410 #define RGB565_32(dst, src, map) (map[src[0]*2] + map[src[1]*2+1])
411 #else /* ( SDL_BYTEORDER == SDL_BIG_ENDIAN ) */
412 #define RGB565_32(dst, src, map) (map[src[1]*2] + map[src[0]*2+1])
413 #endif
414 static void Blit_RGB565_32(SDL_BlitInfo *info, const Uint32 *map) 1060 static void Blit_RGB565_32(SDL_BlitInfo *info, const Uint32 *map)
415 { 1061 {
416 #ifndef USE_DUFFS_LOOP 1062 #ifndef USE_DUFFS_LOOP
417 int c; 1063 int c;
418 #endif 1064 #endif
1420 /* Normal N to N optimized blitters */ 2066 /* Normal N to N optimized blitters */
1421 struct blit_table { 2067 struct blit_table {
1422 Uint32 srcR, srcG, srcB; 2068 Uint32 srcR, srcG, srcB;
1423 int dstbpp; 2069 int dstbpp;
1424 Uint32 dstR, dstG, dstB; 2070 Uint32 dstR, dstG, dstB;
1425 SDL_bool cpu_mmx; 2071 Uint32 blit_features;
1426 void *aux_data; 2072 void *aux_data;
1427 SDL_loblit blitfunc; 2073 SDL_loblit blitfunc;
1428 enum { NO_ALPHA, SET_ALPHA, COPY_ALPHA } alpha; 2074 enum { NO_ALPHA=1, SET_ALPHA=2, COPY_ALPHA=4 } alpha;
1429 }; 2075 };
1430 static const struct blit_table normal_blit_1[] = { 2076 static const struct blit_table normal_blit_1[] = {
1431 /* Default for 8-bit RGB source, an invalid combination */ 2077 /* Default for 8-bit RGB source, an invalid combination */
1432 { 0,0,0, 0, 0,0,0, 0, NULL, NULL }, 2078 { 0,0,0, 0, 0,0,0, 0, NULL, NULL },
1433 }; 2079 };
1437 0, ConvertX86p16_16BGR565, ConvertX86, NO_ALPHA }, 2083 0, ConvertX86p16_16BGR565, ConvertX86, NO_ALPHA },
1438 { 0x0000F800,0x000007E0,0x0000001F, 2, 0x00007C00,0x000003E0,0x0000001F, 2084 { 0x0000F800,0x000007E0,0x0000001F, 2, 0x00007C00,0x000003E0,0x0000001F,
1439 0, ConvertX86p16_16RGB555, ConvertX86, NO_ALPHA }, 2085 0, ConvertX86p16_16RGB555, ConvertX86, NO_ALPHA },
1440 { 0x0000F800,0x000007E0,0x0000001F, 2, 0x0000001F,0x000003E0,0x00007C00, 2086 { 0x0000F800,0x000007E0,0x0000001F, 2, 0x0000001F,0x000003E0,0x00007C00,
1441 0, ConvertX86p16_16BGR555, ConvertX86, NO_ALPHA }, 2087 0, ConvertX86p16_16BGR555, ConvertX86, NO_ALPHA },
2088 #endif
2089 #ifdef USE_ALTIVEC_BLITTERS
2090 /* has-altivec */
2091 { 0x0000F800,0x000007E0,0x0000001F, 4, 0x00000000,0x00000000,0x00000000,
2092 2, NULL, Blit_RGB565_32Altivec, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
1442 #endif 2093 #endif
1443 { 0x0000F800,0x000007E0,0x0000001F, 4, 0x00FF0000,0x0000FF00,0x000000FF, 2094 { 0x0000F800,0x000007E0,0x0000001F, 4, 0x00FF0000,0x0000FF00,0x000000FF,
1444 0, NULL, Blit_RGB565_ARGB8888, SET_ALPHA }, 2095 0, NULL, Blit_RGB565_ARGB8888, SET_ALPHA },
1445 { 0x0000F800,0x000007E0,0x0000001F, 4, 0x000000FF,0x0000FF00,0x00FF0000, 2096 { 0x0000F800,0x000007E0,0x0000001F, 4, 0x000000FF,0x0000FF00,0x00FF0000,
1446 0, NULL, Blit_RGB565_ABGR8888, SET_ALPHA }, 2097 0, NULL, Blit_RGB565_ABGR8888, SET_ALPHA },
1483 { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0xFF000000,0x00FF0000,0x0000FF00, 2134 { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0xFF000000,0x00FF0000,0x0000FF00,
1484 0, ConvertX86p32_32RGBA888, ConvertX86, NO_ALPHA }, 2135 0, ConvertX86p32_32RGBA888, ConvertX86, NO_ALPHA },
1485 { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0x0000FF00,0x00FF0000,0xFF000000, 2136 { 0x00FF0000,0x0000FF00,0x000000FF, 4, 0x0000FF00,0x00FF0000,0xFF000000,
1486 0, ConvertX86p32_32BGRA888, ConvertX86, NO_ALPHA }, 2137 0, ConvertX86p32_32BGRA888, ConvertX86, NO_ALPHA },
1487 #else 2138 #else
2139 #ifdef USE_ALTIVEC_BLITTERS
2140 /* has-altivec | dont-use-prefetch */
2141 { 0x00000000,0x00000000,0x00000000, 4, 0x00000000,0x00000000,0x00000000,
2142 6, NULL, ConvertAltivec32to32_noprefetch, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
2143 /* has-altivec */
2144 { 0x00000000,0x00000000,0x00000000, 4, 0x00000000,0x00000000,0x00000000,
2145 2, NULL, ConvertAltivec32to32_prefetch, NO_ALPHA | COPY_ALPHA | SET_ALPHA },
2146 /* has-altivec */
2147 { 0x00000000,0x00000000,0x00000000, 2, 0x0000F800,0x000007E0,0x0000001F,
2148 2, NULL, Blit_RGB888_RGB565Altivec, NO_ALPHA },
2149 #endif
1488 { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x0000F800,0x000007E0,0x0000001F, 2150 { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x0000F800,0x000007E0,0x0000001F,
1489 0, NULL, Blit_RGB888_RGB565, NO_ALPHA }, 2151 0, NULL, Blit_RGB888_RGB565, NO_ALPHA },
1490 { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x00007C00,0x000003E0,0x0000001F, 2152 { 0x00FF0000,0x0000FF00,0x000000FF, 2, 0x00007C00,0x000003E0,0x0000001F,
1491 0, NULL, Blit_RGB888_RGB555, NO_ALPHA }, 2153 0, NULL, Blit_RGB888_RGB555, NO_ALPHA },
1492 #endif 2154 #endif
1495 }; 2157 };
1496 static const struct blit_table *normal_blit[] = { 2158 static const struct blit_table *normal_blit[] = {
1497 normal_blit_1, normal_blit_2, normal_blit_3, normal_blit_4 2159 normal_blit_1, normal_blit_2, normal_blit_3, normal_blit_4
1498 }; 2160 };
1499 2161
2162 /* Mask matches table, or table entry is zero */
2163 #define MASKOK(x, y) (((x) == (y)) || ((y) == 0x00000000))
2164
1500 SDL_loblit SDL_CalculateBlitN(SDL_Surface *surface, int blit_index) 2165 SDL_loblit SDL_CalculateBlitN(SDL_Surface *surface, int blit_index)
1501 { 2166 {
1502 struct private_swaccel *sdata; 2167 struct private_swaccel *sdata;
1503 SDL_PixelFormat *srcfmt; 2168 SDL_PixelFormat *srcfmt;
1504 SDL_PixelFormat *dstfmt; 2169 SDL_PixelFormat *dstfmt;
1530 && surface->map->identity) 2195 && surface->map->identity)
1531 return Blit2to2Key; 2196 return Blit2to2Key;
1532 else if(dstfmt->BytesPerPixel == 1) 2197 else if(dstfmt->BytesPerPixel == 1)
1533 return BlitNto1Key; 2198 return BlitNto1Key;
1534 else { 2199 else {
2200 #ifdef USE_ALTIVEC_BLITTERS
2201 if((srcfmt->BytesPerPixel == 4) && (dstfmt->BytesPerPixel == 4) && SDL_HasAltiVec()) {
2202 return Blit32to32KeyAltivec;
2203 } else
2204 #endif
2205
1535 if(srcfmt->Amask && dstfmt->Amask) 2206 if(srcfmt->Amask && dstfmt->Amask)
1536 return BlitNtoNKeyCopyAlpha; 2207 return BlitNtoNKeyCopyAlpha;
1537 else 2208 else
1538 return BlitNtoNKey; 2209 return BlitNtoNKey;
1539 } 2210 }
1559 } else { 2230 } else {
1560 blitfun = BlitNto1; 2231 blitfun = BlitNto1;
1561 } 2232 }
1562 } else { 2233 } else {
1563 /* Now the meat, choose the blitter we want */ 2234 /* Now the meat, choose the blitter we want */
1564 int a_need = 0; 2235 int a_need = 0;
1565 if(dstfmt->Amask) 2236 if(dstfmt->Amask)
1566 a_need = srcfmt->Amask ? COPY_ALPHA : SET_ALPHA; 2237 a_need = srcfmt->Amask ? COPY_ALPHA : SET_ALPHA;
1567 table = normal_blit[srcfmt->BytesPerPixel-1]; 2238 table = normal_blit[srcfmt->BytesPerPixel-1];
1568 for ( which=0; table[which].srcR; ++which ) { 2239 for ( which=0; table[which].dstbpp; ++which ) {
1569 if ( srcfmt->Rmask == table[which].srcR && 2240 if ( MASKOK(srcfmt->Rmask, table[which].srcR) &&
1570 srcfmt->Gmask == table[which].srcG && 2241 MASKOK(srcfmt->Gmask, table[which].srcG) &&
1571 srcfmt->Bmask == table[which].srcB && 2242 MASKOK(srcfmt->Bmask, table[which].srcB) &&
1572 dstfmt->BytesPerPixel == table[which].dstbpp && 2243 MASKOK(dstfmt->Rmask, table[which].dstR) &&
1573 dstfmt->Rmask == table[which].dstR && 2244 MASKOK(dstfmt->Gmask, table[which].dstG) &&
1574 dstfmt->Gmask == table[which].dstG && 2245 MASKOK(dstfmt->Bmask, table[which].dstB) &&
1575 dstfmt->Bmask == table[which].dstB && 2246 dstfmt->BytesPerPixel == table[which].dstbpp &&
1576 (a_need & table[which].alpha) == a_need && 2247 (a_need & table[which].alpha) == a_need &&
1577 (table[which].cpu_mmx == SDL_HasMMX())) 2248 ((table[which].blit_features & GetBlitFeatures()) == table[which].blit_features) )
1578 break; 2249 break;
1579 } 2250 }
1580 sdata->aux_data = table[which].aux_data; 2251 sdata->aux_data = table[which].aux_data;
1581 blitfun = table[which].blitfunc; 2252 blitfun = table[which].blitfunc;
1582 if(a_need == COPY_ALPHA && blitfun == BlitNtoN) 2253 if(a_need == COPY_ALPHA && blitfun == BlitNtoN)