comparison src/video/SDL_blit_A.c @ 689:5bb080d35049

Date: Tue, 19 Aug 2003 17:57:00 +0200 From: Stephane Marchesin Subject: Re: [SDL] [patch] MMX alpha blit patches with MMX detection I think everything is correct now. I've done as much testing as I could, but some real-world testing wouldn't hurt, I think. The patch is here: http://icps.u-strasbg.fr/~marchesin/sdl_mmxblit.patch If you do a byte-by-byte comparison of the output between the C and MMX functions, you'll notice that the results for 555 and 565 RGB alpha blits aren't exactly the same. This is because the MMX functions for 555 and 565 RGB have a higher accuracy. If you want the exact same behaviour, that's possible by masking the three lower alpha bits in the MMX functions. Just ask! I removed one MMX function because, after I fixed it to match its C equivalent, it turned out to be slower than the C version on a PIII (although a bit faster on an Athlon XP). I've also added MMX and PIII replacements for SDL_memcpy. Those provide some speed-up in testvidinfo -benchmark (at least for me, under Linux & X11).
author Sam Lantinga <slouken@libsdl.org>
date Fri, 22 Aug 2003 05:51:19 +0000
parents f6ffac90895c
children f90d80d68071
comparison
equal deleted inserted replaced
688:c0522010bb6d 689:5bb080d35049
28 #include <stdio.h> 28 #include <stdio.h>
29 29
30 #include "SDL_types.h" 30 #include "SDL_types.h"
31 #include "SDL_video.h" 31 #include "SDL_video.h"
32 #include "SDL_blit.h" 32 #include "SDL_blit.h"
33
34 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
35 #include "mmx.h"
36 /* Function to check the CPU flags */
37 #define MMX_CPU 0x800000
38 #define TDNOW_CPU 0x80000000
39 #define CPU_Flags() Hermes_X86_CPU()
40 #define X86_ASSEMBLER
41 #define HermesConverterInterface void
42 #define HermesClearInterface void
43 #define STACKCALL
44 #include "HeadX86.h"
45 #endif
33 46
34 /* Functions to perform alpha blended blitting */ 47 /* Functions to perform alpha blended blitting */
35 48
36 /* N->1 blending with per-surface alpha */ 49 /* N->1 blending with per-surface alpha */
37 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info) 50 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info)
193 src += srcskip; 206 src += srcskip;
194 dst += dstskip; 207 dst += dstskip;
195 } 208 }
196 } 209 }
197 210
211 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
212 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case (MMX): every colour channel written is the rounded-down average of src and dst, and the top byte of each written pixel is forced to 0xff */
213 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
214 {
215 int width = info->d_width;
216 int height = info->d_height;
217 Uint32 *srcp = (Uint32 *)info->s_pixels;
218 int srcskip = info->s_skip >> 2;
219 Uint32 *dstp = (Uint32 *)info->d_pixels;
220 int dstskip = info->d_skip >> 2;
221 Uint8 load[8]; /* 8-byte staging buffer for the 64-bit constants loaded with movq below; NOTE(review): it is written through a Uint64 pointer, so alignment/aliasing is assumed to be tolerated by the compiler - confirm */
222
223 *(Uint64 *)load = 0x00fefefe00fefefe;/* alpha128 mask */
224 movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
225 *(Uint64 *)load = 0x0001010100010101;/* !alpha128 mask: the low bit of each colour channel */
226 movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
227 *(Uint64 *)load = 0xFF000000FF000000;/* dst alpha mask */
228 movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */
229 while(height--) {
230 DUFFS_LOOP_DOUBLE2( /* macro defined outside this view; the first body advances one pixel, the second advances two */
231 {
232 Uint32 s = *srcp++;
233 Uint32 d = *dstp;
234 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
235 + (s & d & 0x00010101)) | 0xff000000;
236 },{
237 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
238 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
239
240 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
241 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
242
243 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
244 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
245 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
246 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
247
248 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
249 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 (low bits set in both src and dst, same term as s & d & 0x00010101 in the scalar path) */
250 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
251 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
252 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
253 dstp += 2;
254 srcp += 2;
255 }, width);
256 srcp += srcskip;
257 dstp += dstskip;
258 }
259 emms();
260 }
261
262 /* fast RGB888->(A)RGB888 blending with surface alpha (MMX): alpha == 128 is handed to BlitRGBtoRGBSurfaceAlpha128MMX; otherwise each channel becomes dst + (((src - dst) * alpha) >> 8) and the top byte of each written pixel is forced to 0xff */
263 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
264 {
265 unsigned alpha = info->src->alpha;
266 if(alpha == 128) {
267 BlitRGBtoRGBSurfaceAlpha128MMX(info);
268 } else {
269 int width = info->d_width;
270 int height = info->d_height;
271 Uint32 *srcp = (Uint32 *)info->s_pixels;
272 int srcskip = info->s_skip >> 2;
273 Uint32 *dstp = (Uint32 *)info->d_pixels;
274 int dstskip = info->d_skip >> 2;
275 Uint8 load[8] = {alpha, alpha, alpha, alpha,
276 alpha, alpha, alpha, alpha}; /* alpha replicated into all 8 bytes; the buffer is reused below for 64-bit constants. NOTE(review): later written through a Uint64 pointer - alignment/aliasing assumed acceptable, confirm */
277
278 movq_m2r(*load, mm4); /* alpha -> mm4 */
279 *(Uint64 *)load = 0x00FF00FF00FF00FF;
280 movq_m2r(*load, mm3); /* mask -> mm3 */
281 pand_r2r(mm3, mm4); /* mm4 & mask -> 0A0A0A0A -> mm4 */
282 *(Uint64 *)load = 0xFF000000FF000000;/* dst alpha mask */
283 movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */
284
285 while(height--) {
286 DUFFS_LOOP_DOUBLE2({
287 /* One Pixel Blend */
288 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
289 punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */
290 pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */
291
292 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
293 movq_r2r(mm2, mm6);/* dst(ARGB) -> mm6 (0000ARGB); this copy is not read again in the one-pixel path */
294 punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */
295 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
296
297 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
298 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
299 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
300 paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
301 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
302 packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */
303 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
304 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
305 ++srcp;
306 ++dstp;
307 },{
308 /* Two Pixels Blend */
309 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
310 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
311 punpcklbw_r2r(mm0, mm0); /* low - AARRGGBB -> mm0 */
312 pand_r2r(mm3, mm0); /* 0A0R0G0B -> mm0(src1) */
313 punpckhbw_r2r(mm1, mm1); /* high - AARRGGBB -> mm1 */
314 pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1(src2) */
315
316 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
317 movq_r2r(mm2, mm5); /* 2 x dst -> mm5(ARGBARGB); this copy is not read again in the two-pixel path */
318 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
319 punpcklbw_r2r(mm2, mm2); /* low - AARRGGBB -> mm2 */
320 punpckhbw_r2r(mm6, mm6); /* high - AARRGGBB -> mm6 */
321 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2(dst1) */
322
323 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
324 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
325 pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6(dst2) */
326 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
327 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
328 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
329 paddw_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
330 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
331 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
332 paddw_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
333 pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6 */
334 packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */
335 packuswb_r2r(mm6, mm6); /* ARGBARGB -> mm6 */
336 psrlq_i2r(32, mm2); /* mm2 >> 32 -> mm2 (pixel 1 left in the low half, high half cleared) */
337 psllq_i2r(32, mm6); /* mm6 << 32 -> mm6 (pixel 2 left in the high half, low half cleared) */
338 por_r2r(mm6, mm2); /* mm6 | mm2 -> mm2 */
339 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
340 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
341 srcp += 2;
342 dstp += 2;
343 }, width);
344 srcp += srcskip;
345 dstp += dstskip;
346 }
347 emms();
348 }
349 }
350
351 /* fast ARGB888->(A)RGB888 blending with pixel alpha (MMX): the alpha comes from the top byte of each source pixel; the destination pixel's own top byte is kept in every case */
352 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
353 {
354 int width = info->d_width;
355 int height = info->d_height;
356 Uint32 *srcp = (Uint32 *)info->s_pixels;
357 int srcskip = info->s_skip >> 2;
358 Uint32 *dstp = (Uint32 *)info->d_pixels;
359 int dstskip = info->d_skip >> 2;
360 Uint32 alpha = 0;
361 Uint8 load[8]; /* 8-byte staging buffer for the 64-bit constants below; NOTE(review): written through a Uint64 pointer - alignment/aliasing assumed acceptable, confirm */
362
363 *(Uint64 *)load = 0x00FF00FF00FF00FF;
364 movq_m2r(*load, mm3); /* mask -> mm3 */
365 *(Uint64 *)load = 0x00FF000000000000;
366 movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */
367 *(Uint64 *)load = 0x00FFFFFF00FFFFFF;
368 movq_m2r(*load, mm0); /* alpha 255 mask -> mm0 */
369 *(Uint64 *)load = 0xFF000000FF000000;
370 movq_m2r(*load, mm6); /* alpha 255 !mask -> mm6 */
371 while(height--) {
372 DUFFS_LOOP4({
373 alpha = *srcp;
374 alpha >>= 24;
375 /* FIXME: Here we special-case opaque alpha since the
376 compositing used (>>8 instead of /255) doesn't handle
377 it correctly. Also special-case alpha=0 for speed?
378 Benchmark this! */
379 if(alpha) { /* alpha == 0: the destination pixel is left untouched */
380 if(alpha == SDL_ALPHA_OPAQUE) {
381 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
382 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
383 pand_r2r(mm0, mm1); /* src & 0x00FFFFFF: src RGB -> mm1 */
384 pand_r2r(mm6, mm2); /* dst & 0xFF000000: dst alpha -> mm2 */
385 por_r2r(mm1, mm2); /* src RGB | dst alpha -> mm2 */
386 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
387 } else {
388 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
389 punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */
390 pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */
391
392 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
393 punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */
394 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
395
396 movq_r2r(mm2, mm5);/* mm2(0A0R0G0B) -> mm5 */
397 pand_r2r(mm7, mm5); /* mm5 & dst alpha mask -> mm5(0A000000) */
398 psrlq_i2r(24, mm5); /* mm5 >> 24 -> mm5 (0000A000)*/
399
400 movq_r2r(mm1, mm4);/* mm1(0A0R0G0B) -> mm4 */
401 psrlq_i2r(48, mm4); /* mm4 >> 48 -> mm4(0000000A) */
402 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
403 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
404
405 /* blend */
406 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
407 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
408 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
409 paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
410 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
411 packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */
412 pand_r2r(mm0, mm2); /* 0RGB0RGB -> mm2 */
413 por_r2r(mm5, mm2); /* dst alpha | mm2 -> mm2 */
414 movd_r2m(mm2, *dstp);/* mm2 -> dst */
415 }
416 }
417 ++srcp;
418 ++dstp;
419 }, width);
420 srcp += srcskip;
421 dstp += dstskip;
422 }
423 emms();
424 }
425 #endif
426
198 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ 427 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
199 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) 428 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
200 { 429 {
201 int width = info->d_width; 430 int width = info->d_width;
202 int height = info->d_height; 431 int height = info->d_height;
228 int height = info->d_height; 457 int height = info->d_height;
229 Uint32 *srcp = (Uint32 *)info->s_pixels; 458 Uint32 *srcp = (Uint32 *)info->s_pixels;
230 int srcskip = info->s_skip >> 2; 459 int srcskip = info->s_skip >> 2;
231 Uint32 *dstp = (Uint32 *)info->d_pixels; 460 Uint32 *dstp = (Uint32 *)info->d_pixels;
232 int dstskip = info->d_skip >> 2; 461 int dstskip = info->d_skip >> 2;
462 Uint32 s;
463 Uint32 d;
464 Uint32 s1;
465 Uint32 d1;
233 466
234 while(height--) { 467 while(height--) {
235 DUFFS_LOOP4({ 468 DUFFS_LOOP_DOUBLE2({
236 Uint32 s; 469 /* One Pixel Blend */
237 Uint32 d;
238 Uint32 s1;
239 Uint32 d1;
240 s = *srcp; 470 s = *srcp;
241 d = *dstp; 471 d = *dstp;
242 s1 = s & 0xff00ff; 472 s1 = s & 0xff00ff;
243 d1 = d & 0xff00ff; 473 d1 = d & 0xff00ff;
244 d1 = (d1 + ((s1 - d1) * alpha >> 8)) 474 d1 = (d1 + ((s1 - d1) * alpha >> 8))
247 d &= 0xff00; 477 d &= 0xff00;
248 d = (d + ((s - d) * alpha >> 8)) & 0xff00; 478 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
249 *dstp = d1 | d | 0xff000000; 479 *dstp = d1 | d | 0xff000000;
250 ++srcp; 480 ++srcp;
251 ++dstp; 481 ++dstp;
482 },{
483 /* Two Pixels Blend */
484 s = *srcp;
485 d = *dstp;
486 s1 = s & 0xff00ff;
487 d1 = d & 0xff00ff;
488 d1 += (s1 - d1) * alpha >> 8;
489 d1 &= 0xff00ff;
490
491 s = ((s & 0xff00) >> 8) |
492 ((srcp[1] & 0xff00) << 8);
493 d = ((d & 0xff00) >> 8) |
494 ((dstp[1] & 0xff00) << 8);
495 d += (s - d) * alpha >> 8;
496 d &= 0x00ff00ff;
497
498 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000;
499 ++srcp;
500
501 s1 = *srcp;
502 d1 = *dstp;
503 s1 &= 0xff00ff;
504 d1 &= 0xff00ff;
505 d1 += (s1 - d1) * alpha >> 8;
506 d1 &= 0xff00ff;
507
508 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000;
509 ++srcp;
510 ++dstp;
252 }, width); 511 }, width);
253 srcp += srcskip; 512 srcp += srcskip;
254 dstp += dstskip; 513 dstp += dstskip;
255 } 514 }
256 } 515 }
276 Uint32 alpha = s >> 24; 535 Uint32 alpha = s >> 24;
277 /* FIXME: Here we special-case opaque alpha since the 536 /* FIXME: Here we special-case opaque alpha since the
278 compositioning used (>>8 instead of /255) doesn't handle 537 compositioning used (>>8 instead of /255) doesn't handle
279 it correctly. Also special-case alpha=0 for speed? 538 it correctly. Also special-case alpha=0 for speed?
280 Benchmark this! */ 539 Benchmark this! */
281 if(alpha == SDL_ALPHA_OPAQUE) { 540 if(alpha) {
541 if(alpha == SDL_ALPHA_OPAQUE) {
282 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); 542 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
283 } else { 543 } else {
284 /* 544 /*
285 * take out the middle component (green), and process 545 * take out the middle component (green), and process
286 * the other two in parallel. One multiply less. 546 * the other two in parallel. One multiply less.
287 */ 547 */
288 d = *dstp; 548 d = *dstp;
292 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; 552 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
293 s &= 0xff00; 553 s &= 0xff00;
294 d &= 0xff00; 554 d &= 0xff00;
295 d = (d + ((s - d) * alpha >> 8)) & 0xff00; 555 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
296 *dstp = d1 | d | dalpha; 556 *dstp = d1 | d | dalpha;
557 }
297 } 558 }
298 ++srcp; 559 ++srcp;
299 ++dstp; 560 ++dstp;
300 }, width); 561 }, width);
301 srcp += srcskip; 562 srcp += srcskip;
302 dstp += dstskip; 563 dstp += dstskip;
303 } 564 }
304 } 565 }
566
567 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
568 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha: one pixel per iteration, written with GCC inline assembly; the destination pixel's own top byte is kept */
569 inline static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
570 {
571 int width = info->d_width;
572 int height = info->d_height;
573 Uint32 *srcp = (Uint32 *)info->s_pixels;
574 int srcskip = info->s_skip >> 2;
575 Uint32 *dstp = (Uint32 *)info->d_pixels;
576 int dstskip = info->d_skip >> 2;
577
578 Uint32 s;
579 Uint32 alpha;
580
581 __asm__ (
582 /* make mm6 all zeros. */
583 "pxor %%mm6, %%mm6\n"
584
585 /* Make a mask to preserve the alpha. */
586 "pcmpeqb %%mm7, %%mm7\n\t" /* mm7(s) = FF FF FF FF | FF FF FF FF */
587 "psrlq $16, %%mm7\n\t" /* mm7(s) = 00 00 FF FF | FF FF FF FF */
588
589 : );
590
591 while(height--) {
592
593 DUFFS_LOOP4({
594
595 __asm__ (
596 "prefetch 64(%0)\n"
597 "prefetch 64(%1)\n"
598 : : "r" (srcp), "r" (dstp) );
599
600 s = *srcp;
601 alpha = s >> 24;
602 /* FIXME: Here we special-case opaque alpha since the
603 compositing used (>>8 instead of /255) doesn't handle
604 it correctly. Also special-case alpha=0 for speed?
605 Benchmark this! */
606
607 if(alpha == SDL_ALPHA_OPAQUE) {
608 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000);
609 }
610
611 else { /* every non-opaque alpha, including 0, goes through the blend below */
612 __asm__ (
613 /* load in the source, and dst. */
614 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
615 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
616
617 /* Move the src alpha into mm2 */
618
619 /* if supporting pshufw */
620 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
621 /*"psrlw $8, %%mm2\n" */
622
623 /* else: */
624 "movq %%mm0, %%mm2\n"
625 "psrld $24, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
626 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
627 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
628
629 /* move the colors into words. */
630 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
631 "punpcklbw %%mm6, %%mm1\n" /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
632
633 /* src - dst */
634 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
635
636 /* A * (src-dst) */
637 "pmullw %%mm2, %%mm0\n" /* mm0 = As*(As-Ad) As*(Rs-Rd) | As*(Gs-Gd) As*(Bs-Bd) */
638 "pand %%mm7, %%mm0\n" /* to preserve dest alpha: clears the alpha word of the product */
639 "psrlw $8, %%mm0\n" /* mm0 = Ac>>8 Rc>>8 | Gc>>8 Bc>>8 */
640 "paddb %%mm1, %%mm0\n" /* mm0 = Ac+Ad Rc+Rd | Gc+Gd Bc+Bd */
641
642 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
643
644 "movd %%mm0, (%1)\n" /* result in mm0 */
645
646 : : "r" (srcp), "r" (dstp) );
647
648 }
649 ++srcp;
650 ++dstp;
651 }, width);
652 srcp += srcskip;
653 dstp += dstskip;
654 }
655
656 __asm__ (
657 "emms\n"
658 : );
659 }
660 #endif
305 661
306 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ 662 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
307 663
308 /* blend a single 16 bit pixel at 50% */ 664 /* blend a single 16 bit pixel at 50% */
309 #define BLEND16_50(d, s, mask) \ 665 #define BLEND16_50(d, s, mask) \
407 srcp += srcskip; 763 srcp += srcskip;
408 dstp += dstskip; 764 dstp += dstskip;
409 } 765 }
410 } 766 }
411 } 767 }
768
769 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
770 /* fast RGB565->RGB565 blending with surface alpha (MMX): alpha == 128 is handed to Blit16to16SurfaceAlpha128; otherwise the MMX body blends 4 pixels per pass, with scalar bodies for the 1- and 2-pixel cases */
771 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
772 {
773 unsigned alpha = info->src->alpha; /* per-surface alpha as stored; it is reduced to 5 bits further down */
774 if(alpha == 128) {
775 Blit16to16SurfaceAlpha128(info, 0xf7de);
776 } else {
777 int width = info->d_width;
778 int height = info->d_height;
779 Uint16 *srcp = (Uint16 *)info->s_pixels;
780 int srcskip = info->s_skip >> 1;
781 Uint16 *dstp = (Uint16 *)info->d_pixels;
782 int dstskip = info->d_skip >> 1;
783 Uint32 s, d;
784 Uint8 load[8]; /* 8-byte staging buffer for the 64-bit values loaded with movq below; NOTE(review): written through a Uint64 pointer - alignment/aliasing assumed acceptable, confirm */
785
786 alpha &= ~(1+2+4); /* clear the low 3 alpha bits so the MMX body (* alpha >> 8) gives the same result as the scalar bodies (* (alpha >> 3) >> 5) */
787 *(Uint64 *)load = alpha; /* masked alpha staged for the MMX body, before the shift below */
788 alpha >>= 3; /* downscale alpha to 5 bits */
789
790 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
791 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
792 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
793
794 /* Setup the 565 color channel masks */
795 *(Uint64 *)load = 0xF800F800F800F800;
796 movq_m2r(*load, mm1); /* MASKRED -> mm1 */
797 *(Uint64 *)load = 0x07E007E007E007E0;
798 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
799 *(Uint64 *)load = 0x001F001F001F001F;
800 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
801 while(height--) {
802 DUFFS_LOOP_QUATRO2( /* macro defined outside this view; the three bodies advance 1, 2 and 4 pixels respectively */
803 {
804 s = *srcp++;
805 d = *dstp;
806 /*
807 * shift out the middle component (green) to
808 * the high 16 bits, and process all three RGB
809 * components at the same time.
810 */
811 s = (s | s << 16) & 0x07e0f81f;
812 d = (d | d << 16) & 0x07e0f81f;
813 d += (s - d) * alpha >> 5;
814 d &= 0x07e0f81f;
815 *dstp++ = d | d >> 16;
816 },{
817 s = *srcp++;
818 d = *dstp;
819 /*
820 * shift out the middle component (green) to
821 * the high 16 bits, and process all three RGB
822 * components at the same time.
823 */
824 s = (s | s << 16) & 0x07e0f81f;
825 d = (d | d << 16) & 0x07e0f81f;
826 d += (s - d) * alpha >> 5;
827 d &= 0x07e0f81f;
828 *dstp++ = d | d >> 16;
829 s = *srcp++;
830 d = *dstp;
831 /*
832 * shift out the middle component (green) to
833 * the high 16 bits, and process all three RGB
834 * components at the same time.
835 */
836 s = (s | s << 16) & 0x07e0f81f;
837 d = (d | d << 16) & 0x07e0f81f;
838 d += (s - d) * alpha >> 5;
839 d &= 0x07e0f81f;
840 *dstp++ = d | d >> 16;
841 },{
842 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
843 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
844
845 /* RED */
846 movq_r2r(mm2, mm5); /* src -> mm5 */
847 pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */
848 psrlq_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
849
850 movq_r2r(mm3, mm6); /* dst -> mm6 */
851 pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */
852 psrlq_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
853
854 /* blend */
855 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
856 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
857 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
858 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
859 psllq_i2r(11, mm6); /* mm6 << 11 -> mm6 */
860 pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */
861
862 movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */
863 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */
864 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */
865 por_r2r(mm6, mm3); /* save new reds in dsts */
866
867 /* green */
868 movq_r2r(mm2, mm5); /* src -> mm5 */
869 pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */
870 psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */
871
872 movq_r2r(mm3, mm6); /* dst -> mm6 */
873 pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */
874 psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */
875
876 /* blend */
877 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
878 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
879 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
880 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
881 psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */
882 pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */
883
884 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
885 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */
886 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */
887 por_r2r(mm6, mm3); /* save new greens in dsts */
888
889 /* blue */
890 movq_r2r(mm2, mm5); /* src -> mm5 */
891 pand_r2r(mm7 , mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
892
893 movq_r2r(mm3, mm6); /* dst -> mm6 */
894 pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
895
896 /* blend */
897 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
898 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
899 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
900 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
901 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */
902
903 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
904 por_r2r(mm4, mm5); /* MASKGREEN | mm5 -> mm5 */
905 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */
906 por_r2r(mm6, mm3); /* save new blues in dsts */
907
908 movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */
909
910 srcp += 4;
911 dstp += 4;
912 }, width);
913 srcp += srcskip;
914 dstp += dstskip;
915 }
916 emms();
917 }
918 }
919
920 /* fast RGB555->RGB555 blending with surface alpha (MMX): alpha == 128 is handed to Blit16to16SurfaceAlpha128; otherwise the MMX body blends 4 pixels per pass, with scalar bodies for the 1- and 2-pixel cases */
921 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
922 {
923 unsigned alpha = info->src->alpha; /* per-surface alpha as stored; it is reduced to 5 bits further down */
924 if(alpha == 128) {
925 Blit16to16SurfaceAlpha128(info, 0xfbde);
926 } else {
927 int width = info->d_width;
928 int height = info->d_height;
929 Uint16 *srcp = (Uint16 *)info->s_pixels;
930 int srcskip = info->s_skip >> 1;
931 Uint16 *dstp = (Uint16 *)info->d_pixels;
932 int dstskip = info->d_skip >> 1;
933 Uint32 s, d;
934 Uint8 load[8]; /* 8-byte staging buffer for the 64-bit values loaded with movq below; NOTE(review): written through a Uint64 pointer - alignment/aliasing assumed acceptable, confirm */
935
936 alpha &= ~(1+2+4); /* clear the low 3 alpha bits so the MMX body (* alpha >> 8) gives the same result as the scalar bodies (* (alpha >> 3) >> 5) */
937 *(Uint64 *)load = alpha; /* masked alpha staged for the MMX body, before the shift below */
938 alpha >>= 3; /* downscale alpha to 5 bits */
939
940 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
941 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
942 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
943
944 /* Setup the 555 color channel masks */
945 *(Uint64 *)load = 0x7C007C007C007C00;
946 movq_m2r(*load, mm1); /* MASKRED -> mm1 */
947 *(Uint64 *)load = 0x03E003E003E003E0;
948 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
949 *(Uint64 *)load = 0x001F001F001F001F;
950 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
951 while(height--) {
952 DUFFS_LOOP_QUATRO2( /* macro defined outside this view; the three bodies advance 1, 2 and 4 pixels respectively */
953 {
954 s = *srcp++;
955 d = *dstp;
956 /*
957 * shift out the middle component (green) to
958 * the high 16 bits, and process all three RGB
959 * components at the same time.
960 */
961 s = (s | s << 16) & 0x03e07c1f;
962 d = (d | d << 16) & 0x03e07c1f;
963 d += (s - d) * alpha >> 5;
964 d &= 0x03e07c1f;
965 *dstp++ = d | d >> 16;
966 },{
967 s = *srcp++;
968 d = *dstp;
969 /*
970 * shift out the middle component (green) to
971 * the high 16 bits, and process all three RGB
972 * components at the same time.
973 */
974 s = (s | s << 16) & 0x03e07c1f;
975 d = (d | d << 16) & 0x03e07c1f;
976 d += (s - d) * alpha >> 5;
977 d &= 0x03e07c1f;
978 *dstp++ = d | d >> 16;
979 s = *srcp++;
980 d = *dstp;
981 /*
982 * shift out the middle component (green) to
983 * the high 16 bits, and process all three RGB
984 * components at the same time.
985 */
986 s = (s | s << 16) & 0x03e07c1f;
987 d = (d | d << 16) & 0x03e07c1f;
988 d += (s - d) * alpha >> 5;
989 d &= 0x03e07c1f;
990 *dstp++ = d | d >> 16;
991 },{
992 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
993 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
994
995 /* RED */
996 movq_r2r(mm2, mm5); /* src -> mm5 */
997 pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */
998 psrlq_i2r(10, mm5); /* mm5 >> 10 -> mm5 [000r 000r 000r 000r] */
999
1000 movq_r2r(mm3, mm6); /* dst -> mm6 */
1001 pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */
1002 psrlq_i2r(10, mm6); /* mm6 >> 10 -> mm6 [000r 000r 000r 000r] */
1003
1004 /* blend */
1005 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1006 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1007 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
1008 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1009 psllq_i2r(10, mm6); /* mm6 << 10 -> mm6 */
1010 pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */
1011
1012 movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */
1013 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */
1014 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */
1015 por_r2r(mm6, mm3); /* save new reds in dsts */
1016
1017 /* green */
1018 movq_r2r(mm2, mm5); /* src -> mm5 */
1019 pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */
1020 psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */
1021
1022 movq_r2r(mm3, mm6); /* dst -> mm6 */
1023 pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */
1024 psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */
1025
1026 /* blend */
1027 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1028 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1029 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
1030 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1031 psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */
1032 pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */
1033
1034 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
1035 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */
1036 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */
1037 por_r2r(mm6, mm3); /* save new greens in dsts */
1038
1039 /* blue */
1040 movq_r2r(mm2, mm5); /* src -> mm5 */
1041 pand_r2r(mm7 , mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
1042
1043 movq_r2r(mm3, mm6); /* dst -> mm6 */
1044 pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
1045
1046 /* blend */
1047 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1048 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1049 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */
1050 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1051 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */
1052
1053 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
1054 por_r2r(mm4, mm5); /* MASKGREEN | mm5 -> mm5 */
1055 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */
1056 por_r2r(mm6, mm3); /* save new blues in dsts */
1057
1058 movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */
1059
1060 srcp += 4;
1061 dstp += 4;
1062 }, width);
1063 srcp += srcskip;
1064 dstp += dstskip;
1065 }
1066 emms();
1067 }
1068 }
1069 #endif
412 1070
413 /* fast RGB565->RGB565 blending with surface alpha */ 1071 /* fast RGB565->RGB565 blending with surface alpha */
414 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info) 1072 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
415 { 1073 {
416 unsigned alpha = info->src->alpha; 1074 unsigned alpha = info->src->alpha;
498 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */ 1156 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
499 /* FIXME: Here we special-case opaque alpha since the 1157 /* FIXME: Here we special-case opaque alpha since the
500 compositioning used (>>8 instead of /255) doesn't handle 1158 compositioning used (>>8 instead of /255) doesn't handle
501 it correctly. Also special-case alpha=0 for speed? 1159 it correctly. Also special-case alpha=0 for speed?
502 Benchmark this! */ 1160 Benchmark this! */
503 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { 1161 if(alpha) {
1162 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
504 *dstp = (s >> 8 & 0xf800) + (s >> 5 & 0x7e0) 1163 *dstp = (s >> 8 & 0xf800) + (s >> 5 & 0x7e0)
505 + (s >> 3 & 0x1f); 1164 + (s >> 3 & 0x1f);
506 } else { 1165 } else {
507 Uint32 d = *dstp; 1166 Uint32 d = *dstp;
508 /* 1167 /*
509 * convert source and destination to G0RAB65565 1168 * convert source and destination to G0RAB65565
510 * and blend all components at the same time 1169 * and blend all components at the same time
511 */ 1170 */
513 + (s >> 3 & 0x1f); 1172 + (s >> 3 & 0x1f);
514 d = (d | d << 16) & 0x07e0f81f; 1173 d = (d | d << 16) & 0x07e0f81f;
515 d += (s - d) * alpha >> 5; 1174 d += (s - d) * alpha >> 5;
516 d &= 0x07e0f81f; 1175 d &= 0x07e0f81f;
517 *dstp = d | d >> 16; 1176 *dstp = d | d >> 16;
1177 }
518 } 1178 }
519 srcp++; 1179 srcp++;
520 dstp++; 1180 dstp++;
521 }, width); 1181 }, width);
522 srcp += srcskip; 1182 srcp += srcskip;
541 alpha = s >> 27; /* downscale alpha to 5 bits */ 1201 alpha = s >> 27; /* downscale alpha to 5 bits */
542 /* FIXME: Here we special-case opaque alpha since the 1202 /* FIXME: Here we special-case opaque alpha since the
543 compositioning used (>>8 instead of /255) doesn't handle 1203 compositioning used (>>8 instead of /255) doesn't handle
544 it correctly. Also special-case alpha=0 for speed? 1204 it correctly. Also special-case alpha=0 for speed?
545 Benchmark this! */ 1205 Benchmark this! */
546 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { 1206 if(alpha) {
1207 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
547 *dstp = (s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) 1208 *dstp = (s >> 9 & 0x7c00) + (s >> 6 & 0x3e0)
548 + (s >> 3 & 0x1f); 1209 + (s >> 3 & 0x1f);
549 } else { 1210 } else {
550 Uint32 d = *dstp; 1211 Uint32 d = *dstp;
551 /* 1212 /*
552 * convert source and destination to G0RAB65565 1213 * convert source and destination to G0RAB65565
553 * and blend all components at the same time 1214 * and blend all components at the same time
554 */ 1215 */
556 + (s >> 3 & 0x1f); 1217 + (s >> 3 & 0x1f);
557 d = (d | d << 16) & 0x03e07c1f; 1218 d = (d | d << 16) & 0x03e07c1f;
558 d += (s - d) * alpha >> 5; 1219 d += (s - d) * alpha >> 5;
559 d &= 0x03e07c1f; 1220 d &= 0x03e07c1f;
560 *dstp = d | d >> 16; 1221 *dstp = d | d >> 16;
1222 }
561 } 1223 }
562 srcp++; 1224 srcp++;
563 dstp++; 1225 dstp++;
564 }, width); 1226 }, width);
565 srcp += srcskip; 1227 srcp += srcskip;
581 int srcbpp = srcfmt->BytesPerPixel; 1243 int srcbpp = srcfmt->BytesPerPixel;
582 int dstbpp = dstfmt->BytesPerPixel; 1244 int dstbpp = dstfmt->BytesPerPixel;
583 unsigned sA = srcfmt->alpha; 1245 unsigned sA = srcfmt->alpha;
584 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0; 1246 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
585 1247
586 while ( height-- ) { 1248 if(sA) {
1249 while ( height-- ) {
587 DUFFS_LOOP4( 1250 DUFFS_LOOP4(
588 { 1251 {
589 Uint32 pixel; 1252 Uint32 pixel;
590 unsigned sR; 1253 unsigned sR;
591 unsigned sG; 1254 unsigned sG;
601 dst += dstbpp; 1264 dst += dstbpp;
602 }, 1265 },
603 width); 1266 width);
604 src += srcskip; 1267 src += srcskip;
605 dst += dstskip; 1268 dst += dstskip;
1269 }
606 } 1270 }
607 } 1271 }
608 1272
609 /* General (slow) colorkeyed N->N blending with per-surface alpha */ 1273 /* General (slow) colorkeyed N->N blending with per-surface alpha */
610 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info) 1274 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info)
632 unsigned sB; 1296 unsigned sB;
633 unsigned dR; 1297 unsigned dR;
634 unsigned dG; 1298 unsigned dG;
635 unsigned dB; 1299 unsigned dB;
636 RETRIEVE_RGB_PIXEL(src, srcbpp, pixel); 1300 RETRIEVE_RGB_PIXEL(src, srcbpp, pixel);
637 if(pixel != ckey) { 1301 if(sA && pixel != ckey) {
638 RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); 1302 RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB);
639 DISEMBLE_RGB(dst, dstbpp, dstfmt, pixel, dR, dG, dB); 1303 DISEMBLE_RGB(dst, dstbpp, dstfmt, pixel, dR, dG, dB);
640 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); 1304 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
641 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); 1305 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
642 } 1306 }
684 unsigned dG; 1348 unsigned dG;
685 unsigned dB; 1349 unsigned dB;
686 unsigned sA; 1350 unsigned sA;
687 unsigned dA; 1351 unsigned dA;
688 DISEMBLE_RGBA(src, srcbpp, srcfmt, pixel, sR, sG, sB, sA); 1352 DISEMBLE_RGBA(src, srcbpp, srcfmt, pixel, sR, sG, sB, sA);
689 DISEMBLE_RGBA(dst, dstbpp, dstfmt, pixel, dR, dG, dB, dA); 1353 if(sA) {
690 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); 1354 DISEMBLE_RGBA(dst, dstbpp, dstfmt, pixel, dR, dG, dB, dA);
691 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); 1355 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB);
1356 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1357 }
692 src += srcbpp; 1358 src += srcbpp;
693 dst += dstbpp; 1359 dst += dstbpp;
694 }, 1360 },
695 width); 1361 width);
696 src += srcskip; 1362 src += srcskip;
717 return BlitNto1SurfaceAlpha; 1383 return BlitNto1SurfaceAlpha;
718 1384
719 case 2: 1385 case 2:
720 if(surface->map->identity) { 1386 if(surface->map->identity) {
721 if(df->Gmask == 0x7e0) 1387 if(df->Gmask == 0x7e0)
1388 {
1389 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
1390 if((CPU_Flags()&MMX_CPU)!=0)
1391 return Blit565to565SurfaceAlphaMMX;
1392 else
1393 #endif
722 return Blit565to565SurfaceAlpha; 1394 return Blit565to565SurfaceAlpha;
1395 }
723 else if(df->Gmask == 0x3e0) 1396 else if(df->Gmask == 0x3e0)
1397 {
1398 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
1399 if((CPU_Flags()&MMX_CPU)!=0)
1400 return Blit555to555SurfaceAlphaMMX;
1401 else
1402 #endif
724 return Blit555to555SurfaceAlpha; 1403 return Blit555to555SurfaceAlpha;
1404 }
725 } 1405 }
726 return BlitNtoNSurfaceAlpha; 1406 return BlitNtoNSurfaceAlpha;
727 1407
728 case 4: 1408 case 4:
729 if(sf->Rmask == df->Rmask 1409 if(sf->Rmask == df->Rmask
730 && sf->Gmask == df->Gmask 1410 && sf->Gmask == df->Gmask
731 && sf->Bmask == df->Bmask 1411 && sf->Bmask == df->Bmask
732 && (sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff 1412 && (sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff
733 && sf->BytesPerPixel == 4) 1413 && sf->BytesPerPixel == 4)
1414 {
1415 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
1416 if((CPU_Flags()&MMX_CPU)!=0)
1417 return BlitRGBtoRGBSurfaceAlphaMMX;
1418 else
1419 #endif
734 return BlitRGBtoRGBSurfaceAlpha; 1420 return BlitRGBtoRGBSurfaceAlpha;
1421 }
735 else 1422 else
736 return BlitNtoNSurfaceAlpha; 1423 return BlitNtoNSurfaceAlpha;
737 1424
738 case 3: 1425 case 3:
739 default: 1426 default:
762 if(sf->Amask == 0xff000000 1449 if(sf->Amask == 0xff000000
763 && sf->Rmask == df->Rmask 1450 && sf->Rmask == df->Rmask
764 && sf->Gmask == df->Gmask 1451 && sf->Gmask == df->Gmask
765 && sf->Bmask == df->Bmask 1452 && sf->Bmask == df->Bmask
766 && sf->BytesPerPixel == 4) 1453 && sf->BytesPerPixel == 4)
767 return BlitRGBtoRGBPixelAlpha; 1454 {
1455 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT)
1456 Uint32 f;
1457 f=CPU_Flags();
1458 if((f&(TDNOW_CPU|MMX_CPU))==(TDNOW_CPU|MMX_CPU))
1459 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
1460 else
1461 if((f&MMX_CPU)!=0)
1462 return BlitRGBtoRGBPixelAlphaMMX;
1463 else
1464 #endif
1465 return BlitRGBtoRGBPixelAlpha;
1466 }
768 return BlitNtoNPixelAlpha; 1467 return BlitNtoNPixelAlpha;
769 1468
770 case 3: 1469 case 3:
771 default: 1470 default:
772 return BlitNtoNPixelAlpha; 1471 return BlitNtoNPixelAlpha;