Mercurial repository: sdl-ios-xcode
comparison: src/video/SDL_blit_A.c @ 1047:ffaaf7ecf685
Altivec-optimized blitters!
Vast majority of this work is compliments of Bob Ippolito.
http://www.devolution.com/pipermail/sdl/2005-February/067466.html and many
other posts.
author    Ryan C. Gordon <icculus@icculus.org>
date      Sun, 17 Apr 2005 10:19:22 +0000
parents   9ef41050100c
children  2651158f59b8
1046:f09d5edfc7a3 | 1047:ffaaf7ecf685
33 | 33 |
34 #if (defined(i386) || defined(__x86_64__)) && defined(__GNUC__) && defined(USE_ASMBLIT) | 34 #if (defined(i386) || defined(__x86_64__)) && defined(__GNUC__) && defined(USE_ASMBLIT) |
35 #define MMX_ASMBLIT | 35 #define MMX_ASMBLIT |
36 #endif | 36 #endif |
37 | 37 |
38 #ifdef MMX_ASMBLIT | |
39 /* Function to check the CPU flags */ | 38 /* Function to check the CPU flags */ |
40 #include "SDL_cpuinfo.h" | 39 #include "SDL_cpuinfo.h" |
40 #ifdef MMX_ASMBLIT | |
41 #include "mmx.h" | 41 #include "mmx.h" |
42 #endif | 42 #endif |
43 | 43 |
44 /* Functions to perform alpha blended blitting */ | 44 /* Functions to perform alpha blended blitting */ |
45 | 45 |
419 } | 419 } |
420 emms(); | 420 emms(); |
421 } | 421 } |
422 #endif | 422 #endif |
423 | 423 |
424 #ifdef USE_ALTIVEC_BLITTERS | |
425 #include <assert.h> | |
426 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F) | |
427 #define VECPRINT(msg, v) do { \ | |
428 vector unsigned int tmpvec = (vector unsigned int)(v); \ | |
429 unsigned int *vp = (unsigned int *)&tmpvec; \ | |
430 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \ | |
431 } while (0) | |
432 | |
433 /* the permutation vector that takes the high bytes out of all the appropriate shorts | 
434 (vector unsigned char)( | |
435 0x00, 0x10, 0x02, 0x12, | |
436 0x04, 0x14, 0x06, 0x16, | |
437 0x08, 0x18, 0x0A, 0x1A, | |
438 0x0C, 0x1C, 0x0E, 0x1E ); | |
439 */ | |
440 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F))) | |
441 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12))) | |
442 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24())) | |
443 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \ | |
444 ? vec_lvsl(0, src) \ | |
445 : vec_add(vec_lvsl(8, src), vec_splat_u8(8))) | |
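
The loops below never load from an unaligned address directly: each source vector is assembled from two aligned vec_ld loads combined with vec_perm, using the alignment vector from vec_lvsl, and the VEC_ALIGNER special case above looks designed to keep a fully aligned row from reading past its last 16-byte block. A minimal scalar sketch of that pattern, assuming nothing beyond standard C (the helper names are mine, not SDL's):

    #include <stdint.h>
    #include <string.h>

    /* Fetch 16 possibly-unaligned bytes by combining the two aligned 16-byte
       blocks that contain them -- the scalar shape of vec_ld + vec_lvsl + vec_perm. */
    static void load_unaligned_16(const unsigned char *src, unsigned char out[16])
    {
        uintptr_t addr = (uintptr_t)src;
        const unsigned char *block = (const unsigned char *)(addr & ~(uintptr_t)15);
        unsigned offset = (unsigned)(addr & 15);       /* what vec_lvsl encodes */
        unsigned char lo[16], hi[16];
        int i;

        memcpy(lo, block, 16);                         /* vec_ld(0, src) */
        if (offset == 0) {                             /* aligned: don't touch the next block */
            memcpy(out, lo, 16);
            return;
        }
        memcpy(hi, block + 16, 16);                    /* vec_ld(15, src) hits the next block */
        for (i = 0; i < 16; i++)                       /* vec_perm with the aligner vector */
            out[i] = (offset + i < 16) ? lo[offset + i] : hi[offset + i - 16];
    }

    int main(void)
    {
        static unsigned char storage[96];
        unsigned char *buf = (unsigned char *)(((uintptr_t)storage + 15) & ~(uintptr_t)15);
        unsigned char out[16];
        int i;
        for (i = 0; i < 64; i++)
            buf[i] = (unsigned char)i;
        load_unaligned_16(buf + 5, out);               /* copies bytes 5..20 of buf */
        return (out[0] == 5 && out[15] == 20) ? 0 : 1;
    }
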
446 | |
447 | |
448 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \ | |
449 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \ | |
450 vector unsigned short vtemp1 = vec_mule(vs, valpha); \ | |
451 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \ | |
452 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \ | |
453 /* valpha2 is 255-alpha */ \ | |
454 vector unsigned char valpha2 = vec_nor(valpha, valpha); \ | |
455 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \ | |
456 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \ | |
457 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \ | |
458 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \ | |
459 /* add source and dest */ \ | |
460 vtemp1 = vec_add(vtemp1, vtemp3); \ | |
461 vtemp2 = vec_add(vtemp2, vtemp4); \ | |
462 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \ | |
463 vtemp1 = vec_add(vtemp1, v1_16); \ | |
464 vtemp3 = vec_sr(vtemp1, v8_16); \ | |
465 vtemp1 = vec_add(vtemp1, vtemp3); \ | |
466 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \ | |
467 vtemp2 = vec_add(vtemp2, v1_16); \ | |
468 vtemp4 = vec_sr(vtemp2, v8_16); \ | |
469 vtemp2 = vec_add(vtemp2, vtemp4); \ | |
470 /* (>>8) and get ARGBARGBARGBARGB */ \ | |
471 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \ | |
472 } while (0) | |
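
Per 8-bit channel, the macro above computes s*alpha + d*(255-alpha) in 16-bit lanes and then divides by 255 without an integer divide: it adds 1, adds the result shifted right by 8, and lets the final merge permute keep only the high byte of each lane (the remaining >> 8). A minimal scalar sketch of that rounding step, assuming only standard C (not SDL code):

    #include <assert.h>

    /* ((x + 1) + ((x + 1) >> 8)) >> 8 equals x / 255 for 0 <= x <= 255*255,
       which covers every value of s*a + d*(255-a). */
    static unsigned div255(unsigned x)
    {
        x += 1;
        return (x + (x >> 8)) >> 8;
    }

    static unsigned blend_channel(unsigned s, unsigned d, unsigned a)
    {
        return div255(s * a + d * (255 - a));
    }

    int main(void)
    {
        unsigned x;
        for (x = 0; x <= 255 * 255; x++)
            assert(div255(x) == x / 255);
        assert(blend_channel(255, 0, 255) == 255);
        assert(blend_channel(0, 200, 0) == 200);
        return 0;
    }
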
473 | |
474 /* Calculate the permute vector used for 32->32 swizzling */ | |
475 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt, | |
476 const SDL_PixelFormat *dstfmt) | |
477 { | |
478 /* | |
479 * We have to assume that the bits that aren't used by the other | 
480 * colors are alpha, and that it's one complete byte, since some formats | 
481 * leave alpha with a zero mask, but we should still swizzle the bits. | |
482 */ | |
483 /* ARGB */ | |
484 const static struct SDL_PixelFormat default_pixel_format = { | |
485 NULL, 0, 0, | |
486 0, 0, 0, 0, | |
487 16, 8, 0, 24, | |
488 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000, | |
489 0, 0}; | |
490 if (!srcfmt) { | |
491 srcfmt = &default_pixel_format; | |
492 } | |
493 if (!dstfmt) { | |
494 dstfmt = &default_pixel_format; | |
495 } | |
496 vector unsigned char plus = (vector unsigned char) | |
497 ( 0x00, 0x00, 0x00, 0x00, | |
498 0x04, 0x04, 0x04, 0x04, | |
499 0x08, 0x08, 0x08, 0x08, | |
500 0x0C, 0x0C, 0x0C, 0x0C ); | |
501 vector unsigned char vswiz; | |
502 vector unsigned int srcvec; | |
503 #define RESHIFT(X) (3 - ((X) >> 3)) | |
504 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift); | |
505 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift); | |
506 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift); | |
507 Uint32 amask; | |
508 /* Use zero for alpha if either surface doesn't have alpha */ | |
509 if (dstfmt->Amask) { | |
510 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift); | |
511 } else { | |
512 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF); | |
513 } | |
514 #undef RESHIFT | |
515 ((unsigned int *)&srcvec)[0] = (rmask | gmask | bmask | amask); | |
516 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0)); | |
517 return(vswiz); | |
518 } | |
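
calc_swizzle32 builds a vec_perm control vector from the two formats' channel shifts: RESHIFT turns a bit shift into a byte index inside a big-endian 32-bit pixel, and those indices are packed into one word, splatted across the vector, and offset by 0/4/8/12 for the four pixels. A hedged scalar sketch of the per-pixel byte mapping it encodes (big-endian layout assumed, as on the PowerPC targets; the real function also handles the missing-alpha cases, which this ignores):

    #include <stdio.h>

    /* Bit shift -> byte index in a big-endian 32-bit pixel:
       shift 24 -> byte 0, 16 -> 1, 8 -> 2, 0 -> 3. */
    #define RESHIFT(X) (3 - ((X) >> 3))

    /* For each destination byte, which source byte should the permute copy? */
    static void swizzle_indices(int sR, int sG, int sB, int sA,
                                int dR, int dG, int dB, int dA,
                                unsigned char idx[4])
    {
        idx[RESHIFT(dR)] = RESHIFT(sR);
        idx[RESHIFT(dG)] = RESHIFT(sG);
        idx[RESHIFT(dB)] = RESHIFT(sB);
        idx[RESHIFT(dA)] = RESHIFT(sA);
    }

    int main(void)
    {
        unsigned char idx[4];
        /* ARGB8888 (A=24 R=16 G=8 B=0) to the default ARGB layout:
           prints the identity mapping 0 1 2 3. */
        swizzle_indices(16, 8, 0, 24, 16, 8, 0, 24, idx);
        printf("%d %d %d %d\n", idx[0], idx[1], idx[2], idx[3]);
        return 0;
    }
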
519 | |
520 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info) | |
521 { | |
522 int height = info->d_height; | |
523 Uint8 *src = (Uint8 *)info->s_pixels; | |
524 int srcskip = info->s_skip; | |
525 Uint8 *dst = (Uint8 *)info->d_pixels; | |
526 int dstskip = info->d_skip; | |
527 SDL_PixelFormat *srcfmt = info->src; | |
528 | |
529 vector unsigned char v0 = vec_splat_u8(0); | |
530 vector unsigned short v8_16 = vec_splat_u16(8); | |
531 vector unsigned short v1_16 = vec_splat_u16(1); | |
532 vector unsigned short v2_16 = vec_splat_u16(2); | |
533 vector unsigned short v3_16 = vec_splat_u16(3); | |
534 vector unsigned int v8_32 = vec_splat_u32(8); | |
535 vector unsigned int v16_32 = vec_add(v8_32, v8_32); | |
536 vector unsigned short v3f = (vector unsigned short)( | |
537 0x003f, 0x003f, 0x003f, 0x003f, | |
538 0x003f, 0x003f, 0x003f, 0x003f); | |
539 vector unsigned short vfc = (vector unsigned short)( | |
540 0x00fc, 0x00fc, 0x00fc, 0x00fc, | |
541 0x00fc, 0x00fc, 0x00fc, 0x00fc); | |
542 | |
543 /* | |
544 0x10 - 0x1f is the alpha | |
545 0x00 - 0x0e evens are the red | |
546 0x01 - 0x0f odds are zero | |
547 */ | |
548 vector unsigned char vredalpha1 = (vector unsigned char)( | |
549 0x10, 0x00, 0x01, 0x01, | |
550 0x10, 0x02, 0x01, 0x01, | |
551 0x10, 0x04, 0x01, 0x01, | |
552 0x10, 0x06, 0x01, 0x01 | |
553 ); | |
554 vector unsigned char vredalpha2 = (vector unsigned char)( | |
555 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32)) | |
556 ); | |
557 /* | |
558 0x00 - 0x0f is ARxx ARxx ARxx ARxx | |
559 0x11 - 0x17 odds are blue | 
560 */ | |
561 vector unsigned char vblue1 = (vector unsigned char)( | |
562 0x00, 0x01, 0x02, 0x11, | |
563 0x04, 0x05, 0x06, 0x13, | |
564 0x08, 0x09, 0x0a, 0x15, | |
565 0x0c, 0x0d, 0x0e, 0x17 | |
566 ); | |
567 vector unsigned char vblue2 = (vector unsigned char)( | |
568 vec_add((vector unsigned int)vblue1, v8_32) | |
569 ); | |
570 /* | |
571 0x00 - 0x0f is ARxB ARxB ARxB ARxB | |
572 0x10 - 0x16 evens are green | 
573 */ | |
574 vector unsigned char vgreen1 = (vector unsigned char)( | |
575 0x00, 0x01, 0x10, 0x03, | |
576 0x04, 0x05, 0x12, 0x07, | |
577 0x08, 0x09, 0x14, 0x0b, | |
578 0x0c, 0x0d, 0x16, 0x0f | |
579 ); | |
580 vector unsigned char vgreen2 = (vector unsigned char)( | |
581 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32)) | |
582 ); | |
583 vector unsigned char vgmerge = (vector unsigned char)( | |
584 0x00, 0x02, 0x00, 0x06, | |
585 0x00, 0x0a, 0x00, 0x0e, | |
586 0x00, 0x12, 0x00, 0x16, | |
587 0x00, 0x1a, 0x00, 0x1e); | |
588 vector unsigned char mergePermute = VEC_MERGE_PERMUTE(); | |
589 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL); | |
590 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); | |
591 | |
592 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7); | |
593 vf800 = vec_sl(vf800, vec_splat_u16(8)); | |
594 | |
595 while(height--) { | |
596 int extrawidth; | |
597 vector unsigned char valigner; | |
598 vector unsigned char vsrc; | |
599 vector unsigned char voverflow; | |
600 int width = info->d_width; | |
601 | |
602 #define ONE_PIXEL_BLEND(condition, widthvar) \ | |
603 while (condition) { \ | |
604 Uint32 pixel; \ | |
605 unsigned sR, sG, sB, dR, dG, dB, sA; \ | |
606 DISEMBLE_RGBA(src, 4, srcfmt, pixel, sR, sG, sB, sA); \ | |
607 if(sA) { \ | |
608 unsigned short dstpixel = *((unsigned short *)dst); \ | |
609 dR = (dstpixel >> 8) & 0xf8; \ | |
610 dG = (dstpixel >> 3) & 0xfc; \ | |
611 dB = (dstpixel << 3) & 0xf8; \ | |
612 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ | |
613 *((unsigned short *)dst) = ( \ | |
614 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \ | |
615 ); \ | |
616 } \ | |
617 src += 4; \ | |
618 dst += 2; \ | |
619 widthvar--; \ | |
620 } | |
621 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width); | |
622 extrawidth = (width % 8); | |
623 valigner = VEC_ALIGNER(src); | |
624 vsrc = (vector unsigned char)vec_ld(0, src); | |
625 width -= extrawidth; | |
626 while (width) { | |
627 vector unsigned char valpha; | |
628 vector unsigned char vsrc1, vsrc2; | |
629 vector unsigned char vdst1, vdst2; | |
630 vector unsigned short vR, vG, vB; | |
631 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel; | |
632 | |
633 /* Load 8 pixels from src as ARGB */ | |
634 voverflow = (vector unsigned char)vec_ld(15, src); | |
635 vsrc = vec_perm(vsrc, voverflow, valigner); | |
636 vsrc1 = vec_perm(vsrc, vsrc, vpermute); | |
637 src += 16; | |
638 vsrc = (vector unsigned char)vec_ld(15, src); | |
639 voverflow = vec_perm(voverflow, vsrc, valigner); | |
640 vsrc2 = vec_perm(voverflow, voverflow, vpermute); | |
641 src += 16; | |
642 | |
643 /* Load 8 pixels from dst as XRGB */ | |
644 voverflow = vec_ld(0, dst); | |
645 vR = vec_and((vector unsigned short)voverflow, vf800); | |
646 vB = vec_sl((vector unsigned short)voverflow, v3_16); | |
647 vG = vec_sl(vB, v2_16); | |
648 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1); | |
649 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1); | |
650 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1); | |
651 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2); | |
652 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2); | |
653 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2); | |
654 | |
655 /* Alpha blend 8 pixels as ARGB */ | |
656 valpha = vec_perm(vsrc1, v0, valphaPermute); | |
657 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16); | |
658 valpha = vec_perm(vsrc2, v0, valphaPermute); | |
659 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16); | |
660 | |
661 /* Convert 8 pixels to 565 */ | |
662 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2); | |
663 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge); | |
664 vgpixel = vec_and(vgpixel, vfc); | |
665 vgpixel = vec_sl(vgpixel, v3_16); | |
666 vrpixel = vec_sl(vpixel, v1_16); | |
667 vrpixel = vec_and(vrpixel, vf800); | |
668 vbpixel = vec_and(vpixel, v3f); | |
669 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel); | |
670 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel); | |
671 | |
672 /* Store 8 pixels */ | |
673 vec_st(vdst1, 0, dst); | |
674 | |
675 width -= 8; | |
676 dst += 16; | |
677 } | |
678 ONE_PIXEL_BLEND((extrawidth), extrawidth); | |
679 #undef ONE_PIXEL_BLEND | |
680 src += srcskip; | |
681 dst += dstskip; | |
682 } | |
683 } | |
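
Blit32to565PixelAlphaAltivec widens each 565 destination pixel to 8-bit channels, blends in ARGB, and repacks to 565; the ONE_PIXEL_BLEND macro above is the per-pixel form of the same conversion. A small sketch of just the expand/pack round trip (illustration only, not SDL code):

    #include <assert.h>
    #include <stdint.h>

    /* Expand RGB565 to 8-bit channels as ONE_PIXEL_BLEND does
       (the low bits of each widened channel are simply left at zero). */
    static void expand565(uint16_t p, unsigned *r, unsigned *g, unsigned *b)
    {
        *r = (p >> 8) & 0xf8;
        *g = (p >> 3) & 0xfc;
        *b = (p << 3) & 0xf8;
    }

    /* Repack 8-bit channels into RGB565, dropping the low bits again. */
    static uint16_t pack565(unsigned r, unsigned g, unsigned b)
    {
        return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
    }

    int main(void)
    {
        unsigned p;
        for (p = 0; p <= 0xffff; p++) {
            unsigned r, g, b;
            expand565((uint16_t)p, &r, &g, &b);
            assert(pack565(r, g, b) == p);   /* the round trip is lossless */
        }
        return 0;
    }
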
684 | |
685 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info) | |
686 { | |
687 unsigned alpha = info->src->alpha; | |
688 int height = info->d_height; | |
689 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
690 int srcskip = info->s_skip >> 2; | |
691 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
692 int dstskip = info->d_skip >> 2; | |
693 SDL_PixelFormat *srcfmt = info->src; | |
694 SDL_PixelFormat *dstfmt = info->dst; | |
695 unsigned sA = srcfmt->alpha; | |
696 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0; | |
697 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask; | |
698 Uint32 ckey = info->src->colorkey; | |
699 vector unsigned char mergePermute; | |
700 vector unsigned char vsrcPermute; | |
701 vector unsigned char vdstPermute; | |
702 vector unsigned char vsdstPermute; | |
703 vector unsigned char valpha; | |
704 vector unsigned char valphamask; | |
705 vector unsigned char vbits; | |
706 vector unsigned char v0; | |
707 vector unsigned short v1; | |
708 vector unsigned short v8; | |
709 vector unsigned int vckey; | |
710 vector unsigned int vrgbmask; | |
711 | |
712 mergePermute = VEC_MERGE_PERMUTE(); | |
713 v0 = vec_splat_u8(0); | |
714 v1 = vec_splat_u16(1); | |
715 v8 = vec_splat_u16(8); | |
716 | |
717 /* set the alpha to 255 on the destination surf */ | |
718 valphamask = VEC_ALPHA_MASK(); | |
719 | |
720 vsrcPermute = calc_swizzle32(srcfmt, NULL); | |
721 vdstPermute = calc_swizzle32(NULL, dstfmt); | |
722 vsdstPermute = calc_swizzle32(dstfmt, NULL); | |
723 | |
724 /* set a vector full of alpha and 255-alpha */ | |
725 ((unsigned char *)&valpha)[0] = alpha; | |
726 valpha = vec_splat(valpha, 0); | |
727 vbits = (vector unsigned char)vec_splat_s8(-1); | |
728 | |
729 ckey &= rgbmask; | |
730 ((unsigned int *)&vckey)[0] = ckey; | |
731 vckey = vec_splat(vckey, 0); | |
732 ((unsigned int *)&vrgbmask)[0] = rgbmask; | |
733 vrgbmask = vec_splat(vrgbmask, 0); | |
734 | |
735 while(height--) { | |
736 int width = info->d_width; | |
737 #define ONE_PIXEL_BLEND(condition, widthvar) \ | |
738 while (condition) { \ | |
739 Uint32 pixel; \ | |
740 unsigned sR, sG, sB, dR, dG, dB; \ | |
741 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, pixel); \ | |
742 if(sA && pixel != ckey) { \ | |
743 RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); \ | |
744 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, pixel, dR, dG, dB); \ | |
745 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ | |
746 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \ | |
747 } \ | |
748 ((Uint8 *)dstp) += 4; \ | |
749 ((Uint8 *)srcp) += 4; \ | |
750 widthvar--; \ | |
751 } | |
752 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); | |
753 if (width > 0) { | |
754 int extrawidth = (width % 4); | |
755 vector unsigned char valigner = VEC_ALIGNER(srcp); | |
756 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); | |
757 width -= extrawidth; | |
758 while (width) { | |
759 vector unsigned char vsel; | |
760 vector unsigned char voverflow; | |
761 vector unsigned char vd; | |
762 vector unsigned char vd_orig; | |
763 | |
764 /* s = *srcp */ | |
765 voverflow = (vector unsigned char)vec_ld(15, srcp); | |
766 vs = vec_perm(vs, voverflow, valigner); | |
767 | |
768 /* vsel is set for items that match the key */ | |
769 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask); | |
770 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey); | |
771 | |
772 /* permute to source format */ | |
773 vs = vec_perm(vs, valpha, vsrcPermute); | |
774 | |
775 /* d = *dstp */ | |
776 vd = (vector unsigned char)vec_ld(0, dstp); | |
777 vd_orig = vd = vec_perm(vd, v0, vsdstPermute); | |
778 | |
779 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); | |
780 | |
781 /* set the alpha channel to full on */ | |
782 vd = vec_or(vd, valphamask); | |
783 | |
784 /* mask out color key */ | |
785 vd = vec_sel(vd, vd_orig, vsel); | |
786 | |
787 /* permute to dest format */ | |
788 vd = vec_perm(vd, vbits, vdstPermute); | |
789 | |
790 /* *dstp = res */ | |
791 vec_st((vector unsigned int)vd, 0, dstp); | |
792 | |
793 srcp += 4; | |
794 dstp += 4; | |
795 width -= 4; | |
796 vs = voverflow; | |
797 } | |
798 ONE_PIXEL_BLEND((extrawidth), extrawidth); | |
799 } | |
800 #undef ONE_PIXEL_BLEND | |
801 | |
802 srcp += srcskip; | |
803 dstp += dstskip; | |
804 } | |
805 } | |
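
The colorkey handling in this blitter is branch-free: vec_cmpeq builds an all-ones mask for pixels whose RGB bits match the key, and vec_sel then restores the original destination pixels wherever that mask is set. A scalar sketch of the same select on one pixel (hypothetical helper, not SDL code):

    #include <assert.h>
    #include <stdint.h>

    /* Keep the original destination where (src & rgbmask) == ckey,
       otherwise take the blended result -- the vec_cmpeq/vec_sel pair above. */
    static uint32_t keyed_select(uint32_t src, uint32_t blended, uint32_t dst_orig,
                                 uint32_t rgbmask, uint32_t ckey)
    {
        uint32_t match = ((src & rgbmask) == ckey) ? 0xffffffffu : 0u;
        return (dst_orig & match) | (blended & ~match);
    }

    int main(void)
    {
        /* A source pixel equal to the key leaves the destination untouched... */
        assert(keyed_select(0x00ff00ffu, 0x12345678u, 0xdeadbeefu,
                            0x00ffffffu, 0x00ff00ffu) == 0xdeadbeefu);
        /* ...any other source pixel takes the blended value. */
        assert(keyed_select(0x00102030u, 0x12345678u, 0xdeadbeefu,
                            0x00ffffffu, 0x00ff00ffu) == 0x12345678u);
        return 0;
    }
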
806 | |
807 | |
808 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info) | |
809 { | |
810 int width = info->d_width; | |
811 int height = info->d_height; | |
812 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
813 int srcskip = info->s_skip >> 2; | |
814 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
815 int dstskip = info->d_skip >> 2; | |
816 SDL_PixelFormat *srcfmt = info->src; | |
817 SDL_PixelFormat *dstfmt = info->dst; | |
818 vector unsigned char mergePermute; | |
819 vector unsigned char valphaPermute; | |
820 vector unsigned char vsrcPermute; | |
821 vector unsigned char vdstPermute; | |
822 vector unsigned char vsdstPermute; | |
823 vector unsigned char valphamask; | |
824 vector unsigned char vpixelmask; | |
825 vector unsigned char v0; | |
826 vector unsigned short v1; | |
827 vector unsigned short v8; | |
828 | |
829 v0 = vec_splat_u8(0); | |
830 v1 = vec_splat_u16(1); | |
831 v8 = vec_splat_u16(8); | |
832 mergePermute = VEC_MERGE_PERMUTE(); | |
833 valphamask = VEC_ALPHA_MASK(); | |
834 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); | |
835 vpixelmask = vec_nor(valphamask, v0); | |
836 vsrcPermute = calc_swizzle32(srcfmt, NULL); | |
837 vdstPermute = calc_swizzle32(NULL, dstfmt); | |
838 vsdstPermute = calc_swizzle32(dstfmt, NULL); | |
839 | |
840 while ( height-- ) { | |
841 width = info->d_width; | |
842 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ | |
843 Uint32 pixel; \ | |
844 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \ | |
845 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, pixel, sR, sG, sB, sA); \ | |
846 if(sA) { \ | |
847 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, pixel, dR, dG, dB, dA); \ | |
848 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ | |
849 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \ | |
850 } \ | |
851 ++srcp; \ | |
852 ++dstp; \ | |
853 widthvar--; \ | |
854 } | |
855 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); | |
856 if (width > 0) { | |
857 // vsrcPermute | |
858 // vdstPermute | |
859 int extrawidth = (width % 4); | |
860 vector unsigned char valigner = VEC_ALIGNER(srcp); | |
861 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); | |
862 width -= extrawidth; | |
863 while (width) { | |
864 vector unsigned char voverflow; | |
865 vector unsigned char vd; | |
866 vector unsigned char valpha; | |
867 vector unsigned char vdstalpha; | |
868 /* s = *srcp */ | |
869 voverflow = (vector unsigned char)vec_ld(15, srcp); | |
870 vs = vec_perm(vs, voverflow, valigner); | |
871 vs = vec_perm(vs, v0, vsrcPermute); | |
872 | |
873 valpha = vec_perm(vs, v0, valphaPermute); | |
874 | |
875 /* d = *dstp */ | |
876 vd = (vector unsigned char)vec_ld(0, dstp); | |
877 vd = vec_perm(vd, v0, vsdstPermute); | |
878 vdstalpha = vec_and(vd, valphamask); | |
879 | |
880 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); | |
881 | |
882 /* set the alpha to the dest alpha */ | |
883 vd = vec_and(vd, vpixelmask); | |
884 vd = vec_or(vd, vdstalpha); | |
885 vd = vec_perm(vd, v0, vdstPermute); | |
886 | |
887 /* *dstp = res */ | |
888 vec_st((vector unsigned int)vd, 0, dstp); | |
889 | |
890 srcp += 4; | |
891 dstp += 4; | |
892 width -= 4; | |
893 vs = voverflow; | |
894 | |
895 } | |
896 ONE_PIXEL_BLEND((extrawidth), extrawidth); | |
897 } | |
898 srcp += srcskip; | |
899 dstp += dstskip; | |
900 #undef ONE_PIXEL_BLEND | |
901 } | |
902 } | |
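
Because the destination may carry its own alpha channel, this blitter saves the destination's alpha bytes (vdstalpha) before the multiply and writes them back afterwards through vpixelmask and vec_or, so only the color bytes are replaced. A one-pixel scalar sketch of that step (illustration only; the Amask value is just an example):

    #include <assert.h>
    #include <stdint.h>

    /* Keep the destination's alpha across the blend. */
    static uint32_t keep_dst_alpha(uint32_t blended, uint32_t dst, uint32_t amask)
    {
        return (blended & ~amask) | (dst & amask);
    }

    int main(void)
    {
        /* Blended color bytes, original destination alpha (0x80). */
        assert(keep_dst_alpha(0xff112233u, 0x80445566u, 0xff000000u) == 0x80112233u);
        return 0;
    }
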
903 | |
904 /* fast ARGB888->(A)RGB888 blending with pixel alpha */ | |
905 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info) | |
906 { | |
907 int width = info->d_width; | |
908 int height = info->d_height; | |
909 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
910 int srcskip = info->s_skip >> 2; | |
911 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
912 int dstskip = info->d_skip >> 2; | |
913 vector unsigned char mergePermute; | |
914 vector unsigned char valphaPermute; | |
915 vector unsigned char valphamask; | |
916 vector unsigned char vpixelmask; | |
917 vector unsigned char v0; | |
918 vector unsigned short v1; | |
919 vector unsigned short v8; | |
920 v0 = vec_splat_u8(0); | |
921 v1 = vec_splat_u16(1); | |
922 v8 = vec_splat_u16(8); | |
923 mergePermute = VEC_MERGE_PERMUTE(); | |
924 valphamask = VEC_ALPHA_MASK(); | |
925 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); | |
926 | |
927 | |
928 vpixelmask = vec_nor(valphamask, v0); | |
929 while(height--) { | |
930 width = info->d_width; | |
931 #define ONE_PIXEL_BLEND(condition, widthvar) \ | |
932 while ((condition)) { \ | |
933 Uint32 dalpha; \ | |
934 Uint32 d; \ | |
935 Uint32 s1; \ | |
936 Uint32 d1; \ | |
937 Uint32 s = *srcp; \ | |
938 Uint32 alpha = s >> 24; \ | |
939 if(alpha) { \ | |
940 if(alpha == SDL_ALPHA_OPAQUE) { \ | |
941 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \ | |
942 } else { \ | |
943 d = *dstp; \ | |
944 dalpha = d & 0xff000000; \ | |
945 s1 = s & 0xff00ff; \ | |
946 d1 = d & 0xff00ff; \ | |
947 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \ | |
948 s &= 0xff00; \ | |
949 d &= 0xff00; \ | |
950 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \ | |
951 *dstp = d1 | d | dalpha; \ | |
952 } \ | |
953 } \ | |
954 ++srcp; \ | |
955 ++dstp; \ | |
956 widthvar--; \ | |
957 } | |
958 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); | |
959 if (width > 0) { | |
960 int extrawidth = (width % 4); | |
961 vector unsigned char valigner = VEC_ALIGNER(srcp); | |
962 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); | |
963 width -= extrawidth; | |
964 while (width) { | |
965 vector unsigned char voverflow; | |
966 vector unsigned char vd; | |
967 vector unsigned char valpha; | |
968 vector unsigned char vdstalpha; | |
969 /* s = *srcp */ | |
970 voverflow = (vector unsigned char)vec_ld(15, srcp); | |
971 vs = vec_perm(vs, voverflow, valigner); | |
972 | |
973 valpha = vec_perm(vs, v0, valphaPermute); | |
974 | |
975 /* d = *dstp */ | |
976 vd = (vector unsigned char)vec_ld(0, dstp); | |
977 vdstalpha = vec_and(vd, valphamask); | |
978 | |
979 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); | |
980 | |
981 /* set the alpha to the dest alpha */ | |
982 vd = vec_and(vd, vpixelmask); | |
983 vd = vec_or(vd, vdstalpha); | |
984 | |
985 /* *dstp = res */ | |
986 vec_st((vector unsigned int)vd, 0, dstp); | |
987 | |
988 srcp += 4; | |
989 dstp += 4; | |
990 width -= 4; | |
991 vs = voverflow; | |
992 } | |
993 ONE_PIXEL_BLEND((extrawidth), extrawidth); | |
994 } | |
995 srcp += srcskip; | |
996 dstp += dstskip; | |
997 } | |
998 #undef ONE_PIXEL_BLEND | |
999 } | |
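
The scalar fallback in this function's ONE_PIXEL_BLEND uses the classic packed trick: red and blue are masked into a single word with 0xff00ff, so the zero byte between them gives the two 8-bit multiplies room to expand before the final mask, and green is blended in a second pass. A standalone sketch of that path (not SDL code; the macro itself also special-cases alpha 0 and 255):

    #include <assert.h>
    #include <stdint.h>

    /* Blend R and B in one multiply, then G, as the scalar macro does. */
    static uint32_t blend_argb(uint32_t s, uint32_t d, uint32_t alpha)
    {
        uint32_t dalpha = d & 0xff000000u;
        uint32_t s1 = s & 0x00ff00ffu, d1 = d & 0x00ff00ffu;
        d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0x00ff00ffu;
        s &= 0x0000ff00u;
        d &= 0x0000ff00u;
        d = (d + ((s - d) * alpha >> 8)) & 0x0000ff00u;
        return d1 | d | dalpha;
    }

    int main(void)
    {
        /* alpha 0 leaves the destination color untouched. */
        assert(blend_argb(0x00ffffffu, 0xff000000u, 0) == 0xff000000u);
        return 0;
    }
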
1000 | |
1001 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info) | |
1002 { | |
1003 /* XXX : 6 */ | |
1004 unsigned alpha = info->src->alpha; | |
1005 int height = info->d_height; | |
1006 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
1007 int srcskip = info->s_skip >> 2; | |
1008 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
1009 int dstskip = info->d_skip >> 2; | |
1010 SDL_PixelFormat *srcfmt = info->src; | |
1011 SDL_PixelFormat *dstfmt = info->dst; | |
1012 unsigned sA = srcfmt->alpha; | |
1013 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0; | |
1014 vector unsigned char mergePermute; | |
1015 vector unsigned char vsrcPermute; | |
1016 vector unsigned char vdstPermute; | |
1017 vector unsigned char vsdstPermute; | |
1018 vector unsigned char valpha; | |
1019 vector unsigned char valphamask; | |
1020 vector unsigned char vbits; | |
1021 vector unsigned short v1; | |
1022 vector unsigned short v8; | |
1023 | |
1024 mergePermute = VEC_MERGE_PERMUTE(); | |
1025 v1 = vec_splat_u16(1); | |
1026 v8 = vec_splat_u16(8); | |
1027 | |
1028 /* set the alpha to 255 on the destination surf */ | |
1029 valphamask = VEC_ALPHA_MASK(); | |
1030 | |
1031 vsrcPermute = calc_swizzle32(srcfmt, NULL); | |
1032 vdstPermute = calc_swizzle32(NULL, dstfmt); | |
1033 vsdstPermute = calc_swizzle32(dstfmt, NULL); | |
1034 | |
1035 /* set a vector full of alpha and 255-alpha */ | |
1036 ((unsigned char *)&valpha)[0] = alpha; | |
1037 valpha = vec_splat(valpha, 0); | |
1038 vbits = (vector unsigned char)vec_splat_s8(-1); | |
1039 | |
1040 while(height--) { | |
1041 int width = info->d_width; | |
1042 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ | |
1043 Uint32 pixel; \ | |
1044 unsigned sR, sG, sB, dR, dG, dB; \ | |
1045 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, pixel, sR, sG, sB); \ | |
1046 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, pixel, dR, dG, dB); \ | |
1047 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \ | |
1048 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \ | |
1049 ++srcp; \ | |
1050 ++dstp; \ | |
1051 widthvar--; \ | |
1052 } | |
1053 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); | |
1054 if (width > 0) { | |
1055 int extrawidth = (width % 4); | |
1056 vector unsigned char valigner = vec_lvsl(0, srcp); | |
1057 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); | |
1058 width -= extrawidth; | |
1059 while (width) { | |
1060 vector unsigned char voverflow; | |
1061 vector unsigned char vd; | |
1062 | |
1063 /* s = *srcp */ | |
1064 voverflow = (vector unsigned char)vec_ld(15, srcp); | |
1065 vs = vec_perm(vs, voverflow, valigner); | |
1066 vs = vec_perm(vs, valpha, vsrcPermute); | |
1067 | |
1068 /* d = *dstp */ | |
1069 vd = (vector unsigned char)vec_ld(0, dstp); | |
1070 vd = vec_perm(vd, vd, vsdstPermute); | |
1071 | |
1072 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); | |
1073 | |
1074 /* set the alpha channel to full on */ | |
1075 vd = vec_or(vd, valphamask); | |
1076 vd = vec_perm(vd, vbits, vdstPermute); | |
1077 | |
1078 /* *dstp = res */ | |
1079 vec_st((vector unsigned int)vd, 0, dstp); | |
1080 | |
1081 srcp += 4; | |
1082 dstp += 4; | |
1083 width -= 4; | |
1084 vs = voverflow; | |
1085 } | |
1086 ONE_PIXEL_BLEND((extrawidth), extrawidth); | |
1087 } | |
1088 #undef ONE_PIXEL_BLEND | |
1089 | |
1090 srcp += srcskip; | |
1091 dstp += dstskip; | |
1092 } | |
1093 | |
1094 } | |
1095 | |
1096 | |
1097 /* fast RGB888->(A)RGB888 blending */ | |
1098 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info) | |
1099 { | |
1100 unsigned alpha = info->src->alpha; | |
1101 int height = info->d_height; | |
1102 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
1103 int srcskip = info->s_skip >> 2; | |
1104 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
1105 int dstskip = info->d_skip >> 2; | |
1106 vector unsigned char mergePermute; | |
1107 vector unsigned char valpha; | |
1108 vector unsigned char valphamask; | |
1109 vector unsigned short v1; | |
1110 vector unsigned short v8; | |
1111 | |
1112 mergePermute = VEC_MERGE_PERMUTE(); | |
1113 v1 = vec_splat_u16(1); | |
1114 v8 = vec_splat_u16(8); | |
1115 | |
1116 /* set the alpha to 255 on the destination surf */ | |
1117 valphamask = VEC_ALPHA_MASK(); | |
1118 | |
1119 /* set a vector full of alpha and 255-alpha */ | |
1120 ((unsigned char *)&valpha)[0] = alpha; | |
1121 valpha = vec_splat(valpha, 0); | |
1122 | |
1123 while(height--) { | |
1124 int width = info->d_width; | |
1125 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \ | |
1126 Uint32 s = *srcp; \ | |
1127 Uint32 d = *dstp; \ | |
1128 Uint32 s1 = s & 0xff00ff; \ | |
1129 Uint32 d1 = d & 0xff00ff; \ | |
1130 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \ | |
1131 & 0xff00ff; \ | |
1132 s &= 0xff00; \ | |
1133 d &= 0xff00; \ | |
1134 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \ | |
1135 *dstp = d1 | d | 0xff000000; \ | |
1136 ++srcp; \ | |
1137 ++dstp; \ | |
1138 widthvar--; \ | |
1139 } | |
1140 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); | |
1141 if (width > 0) { | |
1142 int extrawidth = (width % 4); | |
1143 vector unsigned char valigner = VEC_ALIGNER(srcp); | |
1144 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp); | |
1145 width -= extrawidth; | |
1146 while (width) { | |
1147 vector unsigned char voverflow; | |
1148 vector unsigned char vd; | |
1149 | |
1150 /* s = *srcp */ | |
1151 voverflow = (vector unsigned char)vec_ld(15, srcp); | |
1152 vs = vec_perm(vs, voverflow, valigner); | |
1153 | |
1154 /* d = *dstp */ | |
1155 vd = (vector unsigned char)vec_ld(0, dstp); | |
1156 | |
1157 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8); | |
1158 | |
1159 /* set the alpha channel to full on */ | |
1160 vd = vec_or(vd, valphamask); | |
1161 | |
1162 /* *dstp = res */ | |
1163 vec_st((vector unsigned int)vd, 0, dstp); | |
1164 | |
1165 srcp += 4; | |
1166 dstp += 4; | |
1167 width -= 4; | |
1168 vs = voverflow; | |
1169 } | |
1170 ONE_PIXEL_BLEND((extrawidth), extrawidth); | |
1171 } | |
1172 #undef ONE_PIXEL_BLEND | |
1173 | |
1174 srcp += srcskip; | |
1175 dstp += dstskip; | |
1176 } | |
1177 } | |
1178 #endif /* USE_ALTIVEC_BLITTERS */ | |
1179 | |
424 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ | 1180 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ |
425 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) | 1181 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) |
426 { | 1182 { |
427 int width = info->d_width; | 1183 int width = info->d_width; |
428 int height = info->d_height; | 1184 int height = info->d_height; |
1370 if(sf->Amask == 0) { | 2126 if(sf->Amask == 0) { |
1371 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) { | 2127 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) { |
1372 if(df->BytesPerPixel == 1) | 2128 if(df->BytesPerPixel == 1) |
1373 return BlitNto1SurfaceAlphaKey; | 2129 return BlitNto1SurfaceAlphaKey; |
1374 else | 2130 else |
1375 return BlitNtoNSurfaceAlphaKey; | 2131 #ifdef USE_ALTIVEC_BLITTERS |
2132 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 && SDL_HasAltiVec()) | |
2133 return Blit32to32SurfaceAlphaKeyAltivec; | |
2134 else | |
2135 #endif | |
2136 return BlitNtoNSurfaceAlphaKey; | |
1376 } else { | 2137 } else { |
1377 /* Per-surface alpha blits */ | 2138 /* Per-surface alpha blits */ |
1378 switch(df->BytesPerPixel) { | 2139 switch(df->BytesPerPixel) { |
1379 case 1: | 2140 case 1: |
1380 return BlitNto1SurfaceAlpha; | 2141 return BlitNto1SurfaceAlpha; |
1412 #ifdef MMX_ASMBLIT | 2173 #ifdef MMX_ASMBLIT |
1413 if(SDL_HasMMX()) | 2174 if(SDL_HasMMX()) |
1414 return BlitRGBtoRGBSurfaceAlphaMMX; | 2175 return BlitRGBtoRGBSurfaceAlphaMMX; |
1415 else | 2176 else |
1416 #endif | 2177 #endif |
2178 #ifdef USE_ALTIVEC_BLITTERS | |
2179 if(SDL_HasAltiVec()) | |
2180 return BlitRGBtoRGBSurfaceAlphaAltivec; | |
2181 else | |
2182 #endif | |
1417 return BlitRGBtoRGBSurfaceAlpha; | 2183 return BlitRGBtoRGBSurfaceAlpha; |
1418 } | 2184 } |
1419 else | 2185 else |
2186 #ifdef USE_ALTIVEC_BLITTERS | |
2187 if((sf->BytesPerPixel == 4) && SDL_HasAltiVec()) | |
2188 return Blit32to32SurfaceAlphaAltivec; | |
2189 else | |
2190 #endif | |
1420 return BlitNtoNSurfaceAlpha; | 2191 return BlitNtoNSurfaceAlpha; |
1421 | 2192 |
1422 case 3: | 2193 case 3: |
1423 default: | 2194 default: |
1424 return BlitNtoNSurfaceAlpha; | 2195 return BlitNtoNSurfaceAlpha; |
1429 switch(df->BytesPerPixel) { | 2200 switch(df->BytesPerPixel) { |
1430 case 1: | 2201 case 1: |
1431 return BlitNto1PixelAlpha; | 2202 return BlitNto1PixelAlpha; |
1432 | 2203 |
1433 case 2: | 2204 case 2: |
2205 #ifdef USE_ALTIVEC_BLITTERS | |
2206 if(sf->BytesPerPixel == 4 && | |
2207 df->Gmask == 0x7e0 && | |
2208 df->Bmask == 0x1f) | |
2209 return Blit32to565PixelAlphaAltivec; | |
2210 else | |
2211 #endif | |
1434 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 | 2212 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 |
1435 && sf->Gmask == 0xff00 | 2213 && sf->Gmask == 0xff00 |
1436 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) | 2214 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) |
1437 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { | 2215 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { |
1438 if(df->Gmask == 0x7e0) | 2216 if(df->Gmask == 0x7e0) |
1455 else | 2233 else |
1456 if(SDL_HasMMX()) | 2234 if(SDL_HasMMX()) |
1457 return BlitRGBtoRGBPixelAlphaMMX; | 2235 return BlitRGBtoRGBPixelAlphaMMX; |
1458 else | 2236 else |
1459 #endif | 2237 #endif |
2238 #ifdef USE_ALTIVEC_BLITTERS | |
2239 if(SDL_HasAltiVec()) | |
2240 return BlitRGBtoRGBPixelAlphaAltivec; | |
2241 else | |
2242 #endif | |
1460 return BlitRGBtoRGBPixelAlpha; | 2243 return BlitRGBtoRGBPixelAlpha; |
1461 } | 2244 } |
2245 #ifdef USE_ALTIVEC_BLITTERS | |
2246 if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec()) | |
2247 return Blit32to32PixelAlphaAltivec; | |
2248 else | |
2249 #endif | |
1462 return BlitNtoNPixelAlpha; | 2250 return BlitNtoNPixelAlpha; |
1463 | 2251 |
1464 case 3: | 2252 case 3: |
1465 default: | 2253 default: |
1466 return BlitNtoNPixelAlpha; | 2254 return BlitNtoNPixelAlpha; |