comparison src/video/SDL_blit_A.c @ 1047:ffaaf7ecf685

Altivec-optimized blitters! Vast majority of this work is compliments of Bob Ippolito. http://www.devolution.com/pipermail/sdl/2005-February/067466.html and many other posts.
author Ryan C. Gordon <icculus@icculus.org>
date Sun, 17 Apr 2005 10:19:22 +0000
parents 9ef41050100c
children 2651158f59b8
comparison
equal deleted inserted replaced
1046:f09d5edfc7a3 1047:ffaaf7ecf685
33 33
34 #if (defined(i386) || defined(__x86_64__)) && defined(__GNUC__) && defined(USE_ASMBLIT) 34 #if (defined(i386) || defined(__x86_64__)) && defined(__GNUC__) && defined(USE_ASMBLIT)
35 #define MMX_ASMBLIT 35 #define MMX_ASMBLIT
36 #endif 36 #endif
37 37
38 #ifdef MMX_ASMBLIT
39 /* Function to check the CPU flags */ 38 /* Function to check the CPU flags */
40 #include "SDL_cpuinfo.h" 39 #include "SDL_cpuinfo.h"
40 #ifdef MMX_ASMBLIT
41 #include "mmx.h" 41 #include "mmx.h"
42 #endif 42 #endif
43 43
44 /* Functions to perform alpha blended blitting */ 44 /* Functions to perform alpha blended blitting */
45 45
419 } 419 }
420 emms(); 420 emms();
421 } 421 }
422 #endif 422 #endif
423 423
424 #ifdef USE_ALTIVEC_BLITTERS
425 #include <assert.h>
/* Non-zero when the address x is not a multiple of 16 (tests the low four address bits). */
426 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
/* Debug helper: prints msg followed by the four 32-bit words of vector v in hex. */
427 #define VECPRINT(msg, v) do { \
428 vector unsigned int tmpvec = (vector unsigned int)(v); \
429 unsigned int *vp = (unsigned int *)&tmpvec; \
430 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
431 } while (0)
432
433 /* the permutation vector that takes the high bytes out of all the appropriate shorts
434 (vector unsigned char)(
435 0x00, 0x10, 0x02, 0x12,
436 0x04, 0x14, 0x06, 0x16,
437 0x08, 0x18, 0x0A, 0x1A,
438 0x0C, 0x1C, 0x0E, 0x1E );
439 */
/* Builds the vector above at run time: byte indices 0..15 (vec_lvsl of address 0)
   plus 0x0F in every 16-bit lane. */
440 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* Four 32-bit lanes each holding 24, formed as 12 + 12. */
441 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* All-ones shifted left by 24 in every 32-bit lane, i.e. 0xFF000000 per pixel,
   returned as a byte vector. */
442 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/* Permute control for realigning a source load; the blitters below use it as
   vec_perm(previous_load, vec_ld(15, src), aligner).
   Unaligned src: vec_lvsl(0, src).
   Aligned src: vec_lvsl(8, src) plus 8 in every byte, which yields indices
   16..31 so that only the second vec_perm operand is selected. */
443 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
444 ? vec_lvsl(0, src) \
445 : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
446
447
/*
 * Alpha-blends the byte vector vs over the byte vector vd and stores the
 * result back into vd.
 *   vs, vd       - source and destination bytes (vd is overwritten)
 *   valpha       - per-byte alpha; 255-alpha is derived with vec_nor
 *   mergePermute - permute vector from VEC_MERGE_PERMUTE()
 *   v1_16, v8_16 - 16-bit lanes holding 1 and 8 respectively
 * Per byte it forms t = s*alpha + d*(255-alpha) as a 16-bit value, then
 * t = (t + 1) + ((t + 1) >> 8); the final permute keeps the high byte of
 * each 16-bit result and interleaves the even- and odd-byte products.
 * The body declares its own temporaries inside a do { } while (0) block.
 */
448 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
449 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
450 vector unsigned short vtemp1 = vec_mule(vs, valpha); \
451 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
452 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
453 /* valpha2 is 255-alpha */ \
454 vector unsigned char valpha2 = vec_nor(valpha, valpha); \
455 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
456 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
457 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
458 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
459 /* add source and dest */ \
460 vtemp1 = vec_add(vtemp1, vtemp3); \
461 vtemp2 = vec_add(vtemp2, vtemp4); \
462 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
463 vtemp1 = vec_add(vtemp1, v1_16); \
464 vtemp3 = vec_sr(vtemp1, v8_16); \
465 vtemp1 = vec_add(vtemp1, vtemp3); \
466 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
467 vtemp2 = vec_add(vtemp2, v1_16); \
468 vtemp4 = vec_sr(vtemp2, v8_16); \
469 vtemp2 = vec_add(vtemp2, vtemp4); \
470 /* (>>8) and get ARGBARGBARGBARGB */ \
471 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
472 } while (0)
473
474 /* Calculate the permute vector used for 32->32 swizzling */
/*
 * srcfmt / dstfmt: pixel formats to convert from / to. A NULL format stands
 * for the built-in ARGB layout (default_pixel_format below).
 * Returns a 16-byte vec_perm control vector: one 4-byte pattern derived from
 * the channel shifts of both formats, replicated for the four pixels of a
 * vector with 0, 4, 8 and 12 added so each pixel indexes its own bytes.
 */
475 static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
476 const SDL_PixelFormat *dstfmt)
477 {
478 /*
479 * We have to assume that the bits that aren't used by other
480 * colors is alpha, and it's one complete byte, since some formats
481 * leave alpha with a zero mask, but we should still swizzle the bits.
482 */
483 /* ARGB */
484 const static struct SDL_PixelFormat default_pixel_format = {
485 NULL, 0, 0,
486 0, 0, 0, 0,
487 16, 8, 0, 24,
488 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
489 0, 0};
490 if (!srcfmt) {
491 srcfmt = &default_pixel_format;
492 }
493 if (!dstfmt) {
494 dstfmt = &default_pixel_format;
495 }
/* per-pixel byte offsets added to the replicated 4-byte pattern */
496 vector unsigned char plus = (vector unsigned char)
497 ( 0x00, 0x00, 0x00, 0x00,
498 0x04, 0x04, 0x04, 0x04,
499 0x08, 0x08, 0x08, 0x08,
500 0x0C, 0x0C, 0x0C, 0x0C );
501 vector unsigned char vswiz;
502 vector unsigned int srcvec;
/* RESHIFT turns a channel's bit shift X into 3 - X/8; the result is placed
   at the destination's shift for the same channel. */
503 #define RESHIFT(X) (3 - ((X) >> 3))
504 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
505 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
506 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
507 Uint32 amask;
508 /* Use zero for alpha if either surface doesn't have alpha */
/* 0x10 indexes the second vec_perm operand, so the alpha byte then comes
   from whatever vector the caller passes as that operand. */
509 if (dstfmt->Amask) {
510 amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
511 } else {
512 amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF);
513 }
514 #undef RESHIFT
/* write the pattern into element 0, splat it to all four pixels, add offsets */
515 ((unsigned int *)&srcvec)[0] = (rmask | gmask | bmask | amask);
516 vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0));
517 return(vswiz);
518 }
519
/*
 * Per-pixel-alpha blit from a 32-bit source onto a 16-bit destination whose
 * pixels are unpacked and repacked here as 5-6-5.
 * Each row is processed in three stages: scalar blending until dst is
 * 16-byte aligned, eight pixels per AltiVec iteration, then a scalar loop
 * for the width % 8 leftover pixels.
 * info supplies the pixel pointers, per-row skips, dimensions and the
 * source pixel format.
 */
520 static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
521 {
522 int height = info->d_height;
523 Uint8 *src = (Uint8 *)info->s_pixels;
524 int srcskip = info->s_skip;
525 Uint8 *dst = (Uint8 *)info->d_pixels;
526 int dstskip = info->d_skip;
527 SDL_PixelFormat *srcfmt = info->src;
528
529 vector unsigned char v0 = vec_splat_u8(0);
530 vector unsigned short v8_16 = vec_splat_u16(8);
531 vector unsigned short v1_16 = vec_splat_u16(1);
532 vector unsigned short v2_16 = vec_splat_u16(2);
533 vector unsigned short v3_16 = vec_splat_u16(3);
534 vector unsigned int v8_32 = vec_splat_u32(8);
535 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
536 vector unsigned short v3f = (vector unsigned short)(
537 0x003f, 0x003f, 0x003f, 0x003f,
538 0x003f, 0x003f, 0x003f, 0x003f);
539 vector unsigned short vfc = (vector unsigned short)(
540 0x00fc, 0x00fc, 0x00fc, 0x00fc,
541 0x00fc, 0x00fc, 0x00fc, 0x00fc);
542
543 /*
544 0x10 - 0x1f is the alpha
545 0x00 - 0x0e evens are the red
546 0x01 - 0x0f odds are zero
547 */
548 vector unsigned char vredalpha1 = (vector unsigned char)(
549 0x10, 0x00, 0x01, 0x01,
550 0x10, 0x02, 0x01, 0x01,
551 0x10, 0x04, 0x01, 0x01,
552 0x10, 0x06, 0x01, 0x01
553 );
/* same pattern for the second group of four pixels: adds 8 << 16 to each
   32-bit lane, i.e. 8 to the red byte index */
554 vector unsigned char vredalpha2 = (vector unsigned char)(
555 vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
556 );
557 /*
558 0x00 - 0x0f is ARxx ARxx ARxx ARxx
559 0x11 - 0x0f odds are blue
560 */
561 vector unsigned char vblue1 = (vector unsigned char)(
562 0x00, 0x01, 0x02, 0x11,
563 0x04, 0x05, 0x06, 0x13,
564 0x08, 0x09, 0x0a, 0x15,
565 0x0c, 0x0d, 0x0e, 0x17
566 );
567 vector unsigned char vblue2 = (vector unsigned char)(
568 vec_add((vector unsigned int)vblue1, v8_32)
569 );
570 /*
571 0x00 - 0x0f is ARxB ARxB ARxB ARxB
572 0x10 - 0x0e evens are green
573 */
574 vector unsigned char vgreen1 = (vector unsigned char)(
575 0x00, 0x01, 0x10, 0x03,
576 0x04, 0x05, 0x12, 0x07,
577 0x08, 0x09, 0x14, 0x0b,
578 0x0c, 0x0d, 0x16, 0x0f
579 );
580 vector unsigned char vgreen2 = (vector unsigned char)(
581 vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
582 );
/* picks the green byte of each of the eight blended pixels into the low
   byte of a 16-bit lane */
583 vector unsigned char vgmerge = (vector unsigned char)(
584 0x00, 0x02, 0x00, 0x06,
585 0x00, 0x0a, 0x00, 0x0e,
586 0x00, 0x12, 0x00, 0x16,
587 0x00, 0x1a, 0x00, 0x1e);
588 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
/* swizzle from the source format to ARGB */
589 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
/* replicates byte 0 of every 4-byte pixel (indices 0,0,0,0,4,4,4,4,...) */
590 vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
591
/* 0xF9 in every byte, shifted left by 8 within each 16-bit lane */
592 vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-7);
593 vf800 = vec_sl(vf800, vec_splat_u16(8));
594
595 while(height--) {
596 int extrawidth;
597 vector unsigned char valigner;
598 vector unsigned char vsrc;
599 vector unsigned char voverflow;
600 int width = info->d_width;
601
/* Scalar blend of one pixel per iteration while `condition` holds; source
   pixels with zero alpha are skipped. */
602 #define ONE_PIXEL_BLEND(condition, widthvar) \
603 while (condition) { \
604 Uint32 pixel; \
605 unsigned sR, sG, sB, dR, dG, dB, sA; \
606 DISEMBLE_RGBA(src, 4, srcfmt, pixel, sR, sG, sB, sA); \
607 if(sA) { \
608 unsigned short dstpixel = *((unsigned short *)dst); \
609 dR = (dstpixel >> 8) & 0xf8; \
610 dG = (dstpixel >> 3) & 0xfc; \
611 dB = (dstpixel << 3) & 0xf8; \
612 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
613 *((unsigned short *)dst) = ( \
614 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
615 ); \
616 } \
617 src += 4; \
618 dst += 2; \
619 widthvar--; \
620 }
/* leading pixels: run until dst is 16-byte aligned or the row is exhausted */
621 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
622 extrawidth = (width % 8);
623 valigner = VEC_ALIGNER(src);
624 vsrc = (vector unsigned char)vec_ld(0, src);
625 width -= extrawidth;
/* width is now a multiple of 8: eight pixels (32 source bytes, 16
   destination bytes) per iteration */
626 while (width) {
627 vector unsigned char valpha;
628 vector unsigned char vsrc1, vsrc2;
629 vector unsigned char vdst1, vdst2;
630 vector unsigned short vR, vG, vB;
631 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
632
633 /* Load 8 pixels from src as ARGB */
634 voverflow = (vector unsigned char)vec_ld(15, src);
635 vsrc = vec_perm(vsrc, voverflow, valigner);
636 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
637 src += 16;
638 vsrc = (vector unsigned char)vec_ld(15, src);
639 voverflow = vec_perm(voverflow, vsrc, valigner);
640 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
641 src += 16;
642
643 /* Load 8 pixels from dst as XRGB */
644 voverflow = vec_ld(0, dst);
645 vR = vec_and((vector unsigned short)voverflow, vf800);
646 vB = vec_sl((vector unsigned short)voverflow, v3_16);
647 vG = vec_sl(vB, v2_16);
648 vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
649 vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
650 vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
651 vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
652 vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
653 vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
654
655 /* Alpha blend 8 pixels as ARGB */
656 valpha = vec_perm(vsrc1, v0, valphaPermute);
657 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
658 valpha = vec_perm(vsrc2, v0, valphaPermute);
659 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
660
661 /* Convert 8 pixels to 565 */
662 vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2);
663 vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
664 vgpixel = vec_and(vgpixel, vfc);
665 vgpixel = vec_sl(vgpixel, v3_16);
666 vrpixel = vec_sl(vpixel, v1_16);
667 vrpixel = vec_and(vrpixel, vf800);
668 vbpixel = vec_and(vpixel, v3f);
669 vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
670 vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
671
672 /* Store 8 pixels */
673 vec_st(vdst1, 0, dst);
674
675 width -= 8;
676 dst += 16;
677 }
/* trailing pixels that did not fill a whole vector */
678 ONE_PIXEL_BLEND((extrawidth), extrawidth);
679 #undef ONE_PIXEL_BLEND
680 src += srcskip;
681 dst += dstskip;
682 }
683 }
684
/*
 * Blit between two 32-bit surfaces using the per-surface alpha
 * (info->src->alpha) and the source colour key: source pixels whose RGB bits
 * equal the key keep the destination colour, all others are blended.
 * Each row is processed in three stages: scalar pixels until dstp is 16-byte
 * aligned, four pixels per AltiVec iteration, then a scalar loop for the
 * width % 4 leftover pixels.
 * info supplies the pixel pointers, per-row skips (in bytes, converted to
 * pixels here), dimensions and both pixel formats.
 */
685 static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
686 {
687 unsigned alpha = info->src->alpha;
688 int height = info->d_height;
689 Uint32 *srcp = (Uint32 *)info->s_pixels;
690 int srcskip = info->s_skip >> 2;
691 Uint32 *dstp = (Uint32 *)info->d_pixels;
692 int dstskip = info->d_skip >> 2;
693 SDL_PixelFormat *srcfmt = info->src;
694 SDL_PixelFormat *dstfmt = info->dst;
695 unsigned sA = srcfmt->alpha;
696 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
697 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
698 Uint32 ckey = info->src->colorkey;
699 vector unsigned char mergePermute;
700 vector unsigned char vsrcPermute;
701 vector unsigned char vdstPermute;
702 vector unsigned char vsdstPermute;
703 vector unsigned char valpha;
704 vector unsigned char valphamask;
705 vector unsigned char vbits;
706 vector unsigned char v0;
707 vector unsigned short v1;
708 vector unsigned short v8;
709 vector unsigned int vckey;
710 vector unsigned int vrgbmask;
711
712 mergePermute = VEC_MERGE_PERMUTE();
713 v0 = vec_splat_u8(0);
714 v1 = vec_splat_u16(1);
715 v8 = vec_splat_u16(8);
716
717 /* set the alpha to 255 on the destination surf */
718 valphamask = VEC_ALPHA_MASK();
719
/* source format -> ARGB, ARGB -> destination format, destination format -> ARGB */
720 vsrcPermute = calc_swizzle32(srcfmt, NULL);
721 vdstPermute = calc_swizzle32(NULL, dstfmt);
722 vsdstPermute = calc_swizzle32(dstfmt, NULL);
723
724 /* set a vector full of alpha and 255-alpha */
725 ((unsigned char *)&valpha)[0] = alpha;
726 valpha = vec_splat(valpha, 0);
727 vbits = (vector unsigned char)vec_splat_s8(-1);
728
/* the key is compared on RGB bits only; splat key and mask to all four lanes */
729 ckey &= rgbmask;
730 ((unsigned int *)&vckey)[0] = ckey;
731 vckey = vec_splat(vckey, 0);
732 ((unsigned int *)&vrgbmask)[0] = rgbmask;
733 vrgbmask = vec_splat(vrgbmask, 0);
734
735 while(height--) {
736 int width = info->d_width;
/* Scalar blend of one pixel per iteration while `condition` holds. */
737 #define ONE_PIXEL_BLEND(condition, widthvar) \
738 while (condition) { \
739 Uint32 pixel; \
740 unsigned sR, sG, sB, dR, dG, dB; \
741 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, pixel); \
/* ckey was masked with rgbmask above, so mask the pixel as well; this */ \
/* matches the key test performed by the vector loop below */ \
742 if(sA && (pixel & rgbmask) != ckey) { \
743 RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); \
744 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, pixel, dR, dG, dB); \
745 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
746 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
747 } \
/* plain increments of the Uint32 pointers advance four bytes; a cast */ \
/* expression is not an lvalue in ISO C */ \
748 ++dstp; \
749 ++srcp; \
750 widthvar--; \
751 }
/* leading pixels: run until dstp is 16-byte aligned or the row is exhausted */
752 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
753 if (width > 0) {
754 int extrawidth = (width % 4);
755 vector unsigned char valigner = VEC_ALIGNER(srcp);
756 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
757 width -= extrawidth;
758 while (width) {
759 vector unsigned char vsel;
760 vector unsigned char voverflow;
761 vector unsigned char vd;
762 vector unsigned char vd_orig;
763
764 /* s = *srcp */
765 voverflow = (vector unsigned char)vec_ld(15, srcp);
766 vs = vec_perm(vs, voverflow, valigner);
767
768 /* vsel is set for items that match the key */
769 vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
770 vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
771
772 /* permute to source format */
773 vs = vec_perm(vs, valpha, vsrcPermute);
774
775 /* d = *dstp */
776 vd = (vector unsigned char)vec_ld(0, dstp);
777 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
778
779 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
780
781 /* set the alpha channel to full on */
782 vd = vec_or(vd, valphamask);
783
784 /* mask out color key */
785 vd = vec_sel(vd, vd_orig, vsel);
786
787 /* permute to dest format */
788 vd = vec_perm(vd, vbits, vdstPermute);
789
790 /* *dstp = res */
791 vec_st((vector unsigned int)vd, 0, dstp);
792
793 srcp += 4;
794 dstp += 4;
795 width -= 4;
/* the block just loaded becomes the first operand of the next realignment */
796 vs = voverflow;
797 }
/* trailing pixels that did not fill a whole vector */
798 ONE_PIXEL_BLEND((extrawidth), extrawidth);
799 }
800 #undef ONE_PIXEL_BLEND
801
802 srcp += srcskip;
803 dstp += dstskip;
804 }
805 }
806
807
/*
 * Per-pixel-alpha blit between two 32-bit surfaces of arbitrary channel
 * order. Source and destination pixels are swizzled to ARGB with the permute
 * vectors from calc_swizzle32, blended using each source pixel's alpha, and
 * swizzled back; the destination's alpha bits are kept.
 * Each row is processed in three stages: scalar pixels until dstp is 16-byte
 * aligned, four pixels per AltiVec iteration, then a scalar loop for the
 * width % 4 leftover pixels.
 */
808 static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
809 {
810 int width = info->d_width;
811 int height = info->d_height;
812 Uint32 *srcp = (Uint32 *)info->s_pixels;
813 int srcskip = info->s_skip >> 2;
814 Uint32 *dstp = (Uint32 *)info->d_pixels;
815 int dstskip = info->d_skip >> 2;
816 SDL_PixelFormat *srcfmt = info->src;
817 SDL_PixelFormat *dstfmt = info->dst;
818 vector unsigned char mergePermute;
819 vector unsigned char valphaPermute;
820 vector unsigned char vsrcPermute;
821 vector unsigned char vdstPermute;
822 vector unsigned char vsdstPermute;
823 vector unsigned char valphamask;
824 vector unsigned char vpixelmask;
825 vector unsigned char v0;
826 vector unsigned short v1;
827 vector unsigned short v8;
828
829 v0 = vec_splat_u8(0);
830 v1 = vec_splat_u16(1);
831 v8 = vec_splat_u16(8);
832 mergePermute = VEC_MERGE_PERMUTE();
833 valphamask = VEC_ALPHA_MASK();
/* replicates byte 0 of every 4-byte pixel (indices 0,0,0,0,4,4,4,4,...) */
834 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
/* complement of the alpha mask: the three colour bytes of each pixel */
835 vpixelmask = vec_nor(valphamask, v0);
/* source format -> ARGB, ARGB -> destination format, destination format -> ARGB */
836 vsrcPermute = calc_swizzle32(srcfmt, NULL);
837 vdstPermute = calc_swizzle32(NULL, dstfmt);
838 vsdstPermute = calc_swizzle32(dstfmt, NULL);
839
840 while ( height-- ) {
841 width = info->d_width;
/* Scalar blend of one pixel per iteration while `condition` holds; source
   pixels with zero alpha are skipped and the destination alpha is rewritten
   unchanged. */
842 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
843 Uint32 pixel; \
844 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
845 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, pixel, sR, sG, sB, sA); \
846 if(sA) { \
847 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, pixel, dR, dG, dB, dA); \
848 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
849 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
850 } \
851 ++srcp; \
852 ++dstp; \
853 widthvar--; \
854 }
/* leading pixels: run until dstp is 16-byte aligned or the row is exhausted */
855 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
856 if (width > 0) {
857 // vsrcPermute
858 // vdstPermute
859 int extrawidth = (width % 4);
860 vector unsigned char valigner = VEC_ALIGNER(srcp);
861 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
862 width -= extrawidth;
863 while (width) {
864 vector unsigned char voverflow;
865 vector unsigned char vd;
866 vector unsigned char valpha;
867 vector unsigned char vdstalpha;
868 /* s = *srcp */
869 voverflow = (vector unsigned char)vec_ld(15, srcp);
870 vs = vec_perm(vs, voverflow, valigner);
871 vs = vec_perm(vs, v0, vsrcPermute);
872
/* spread each pixel's alpha byte across its four bytes */
873 valpha = vec_perm(vs, v0, valphaPermute);
874
875 /* d = *dstp */
876 vd = (vector unsigned char)vec_ld(0, dstp);
877 vd = vec_perm(vd, v0, vsdstPermute);
/* remember the destination alpha before it is blended */
878 vdstalpha = vec_and(vd, valphamask);
879
880 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
881
882 /* set the alpha to the dest alpha */
883 vd = vec_and(vd, vpixelmask);
884 vd = vec_or(vd, vdstalpha);
885 vd = vec_perm(vd, v0, vdstPermute);
886
887 /* *dstp = res */
888 vec_st((vector unsigned int)vd, 0, dstp);
889
890 srcp += 4;
891 dstp += 4;
892 width -= 4;
/* the block just loaded becomes the first operand of the next realignment */
893 vs = voverflow;
894
895 }
/* trailing pixels that did not fill a whole vector */
896 ONE_PIXEL_BLEND((extrawidth), extrawidth);
897 }
898 srcp += srcskip;
899 dstp += dstskip;
900 #undef ONE_PIXEL_BLEND
901 }
902 }
903
904 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * No channel swizzling is done: alpha is taken from the top byte of each
 * source pixel and the top byte of each destination pixel is kept.
 * Each row is processed in three stages: scalar pixels until dstp is 16-byte
 * aligned, four pixels per AltiVec iteration, then a scalar loop for the
 * width % 4 leftover pixels.
 */
905 static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
906 {
907 int width = info->d_width;
908 int height = info->d_height;
909 Uint32 *srcp = (Uint32 *)info->s_pixels;
910 int srcskip = info->s_skip >> 2;
911 Uint32 *dstp = (Uint32 *)info->d_pixels;
912 int dstskip = info->d_skip >> 2;
913 vector unsigned char mergePermute;
914 vector unsigned char valphaPermute;
915 vector unsigned char valphamask;
916 vector unsigned char vpixelmask;
917 vector unsigned char v0;
918 vector unsigned short v1;
919 vector unsigned short v8;
920 v0 = vec_splat_u8(0);
921 v1 = vec_splat_u16(1);
922 v8 = vec_splat_u16(8);
923 mergePermute = VEC_MERGE_PERMUTE();
924 valphamask = VEC_ALPHA_MASK();
/* replicates byte 0 of every 4-byte pixel (indices 0,0,0,0,4,4,4,4,...) */
925 valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC));
926
927
/* complement of the alpha mask: the three colour bytes of each pixel */
928 vpixelmask = vec_nor(valphamask, v0);
929 while(height--) {
930 width = info->d_width;
/* Scalar blend of one pixel per iteration while `condition` holds: zero
   alpha skips the pixel, opaque alpha copies the colour bits, anything else
   blends red/blue together and green separately. */
931 #define ONE_PIXEL_BLEND(condition, widthvar) \
932 while ((condition)) { \
933 Uint32 dalpha; \
934 Uint32 d; \
935 Uint32 s1; \
936 Uint32 d1; \
937 Uint32 s = *srcp; \
938 Uint32 alpha = s >> 24; \
939 if(alpha) { \
940 if(alpha == SDL_ALPHA_OPAQUE) { \
941 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
942 } else { \
943 d = *dstp; \
944 dalpha = d & 0xff000000; \
945 s1 = s & 0xff00ff; \
946 d1 = d & 0xff00ff; \
947 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
948 s &= 0xff00; \
949 d &= 0xff00; \
950 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
951 *dstp = d1 | d | dalpha; \
952 } \
953 } \
954 ++srcp; \
955 ++dstp; \
956 widthvar--; \
957 }
/* leading pixels: run until dstp is 16-byte aligned or the row is exhausted */
958 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
959 if (width > 0) {
960 int extrawidth = (width % 4);
961 vector unsigned char valigner = VEC_ALIGNER(srcp);
962 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
963 width -= extrawidth;
964 while (width) {
965 vector unsigned char voverflow;
966 vector unsigned char vd;
967 vector unsigned char valpha;
968 vector unsigned char vdstalpha;
969 /* s = *srcp */
970 voverflow = (vector unsigned char)vec_ld(15, srcp);
971 vs = vec_perm(vs, voverflow, valigner);
972
/* spread each pixel's alpha byte across its four bytes */
973 valpha = vec_perm(vs, v0, valphaPermute);
974
975 /* d = *dstp */
976 vd = (vector unsigned char)vec_ld(0, dstp);
/* remember the destination alpha before it is blended */
977 vdstalpha = vec_and(vd, valphamask);
978
979 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
980
981 /* set the alpha to the dest alpha */
982 vd = vec_and(vd, vpixelmask);
983 vd = vec_or(vd, vdstalpha);
984
985 /* *dstp = res */
986 vec_st((vector unsigned int)vd, 0, dstp);
987
988 srcp += 4;
989 dstp += 4;
990 width -= 4;
/* the block just loaded becomes the first operand of the next realignment */
991 vs = voverflow;
992 }
/* trailing pixels that did not fill a whole vector */
993 ONE_PIXEL_BLEND((extrawidth), extrawidth);
994 }
995 srcp += srcskip;
996 dstp += dstskip;
997 }
998 #undef ONE_PIXEL_BLEND
999 }
1000
/*
 * Blit between two 32-bit surfaces of arbitrary channel order using the
 * per-surface alpha (info->src->alpha) for every pixel.
 * Source and destination pixels are swizzled to ARGB with the permute
 * vectors from calc_swizzle32, blended, and swizzled back.
 * Each row is processed in three stages: scalar pixels until dstp is 16-byte
 * aligned, four pixels per AltiVec iteration, then a scalar loop for the
 * width % 4 leftover pixels.
 */
1001 static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
1002 {
1003 /* XXX : 6 */
1004 unsigned alpha = info->src->alpha;
1005 int height = info->d_height;
1006 Uint32 *srcp = (Uint32 *)info->s_pixels;
1007 int srcskip = info->s_skip >> 2;
1008 Uint32 *dstp = (Uint32 *)info->d_pixels;
1009 int dstskip = info->d_skip >> 2;
1010 SDL_PixelFormat *srcfmt = info->src;
1011 SDL_PixelFormat *dstfmt = info->dst;
1012 unsigned sA = srcfmt->alpha;
1013 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1014 vector unsigned char mergePermute;
1015 vector unsigned char vsrcPermute;
1016 vector unsigned char vdstPermute;
1017 vector unsigned char vsdstPermute;
1018 vector unsigned char valpha;
1019 vector unsigned char valphamask;
1020 vector unsigned char vbits;
1021 vector unsigned short v1;
1022 vector unsigned short v8;
1023
1024 mergePermute = VEC_MERGE_PERMUTE();
1025 v1 = vec_splat_u16(1);
1026 v8 = vec_splat_u16(8);
1027
1028 /* set the alpha to 255 on the destination surf */
1029 valphamask = VEC_ALPHA_MASK();
1030
/* source format -> ARGB, ARGB -> destination format, destination format -> ARGB */
1031 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1032 vdstPermute = calc_swizzle32(NULL, dstfmt);
1033 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1034
1035 /* set a vector full of alpha and 255-alpha */
1036 ((unsigned char *)&valpha)[0] = alpha;
1037 valpha = vec_splat(valpha, 0);
1038 vbits = (vector unsigned char)vec_splat_s8(-1);
1039
1040 while(height--) {
1041 int width = info->d_width;
/* Scalar blend of one pixel per iteration while `condition` holds. */
1042 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1043 Uint32 pixel; \
1044 unsigned sR, sG, sB, dR, dG, dB; \
1045 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, pixel, sR, sG, sB); \
1046 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, pixel, dR, dG, dB); \
1047 ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1048 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1049 ++srcp; \
1050 ++dstp; \
1051 widthvar--; \
1052 }
/* leading pixels: run until dstp is 16-byte aligned or the row is exhausted */
1053 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1054 if (width > 0) {
1055 int extrawidth = (width % 4);
/* VEC_ALIGNER, not a bare vec_lvsl(0, srcp): for a 16-byte-aligned srcp the
   bare form selects only the carried-over vs, so every iteration after the
   first would blend the previous 16 bytes of source. */
1056 vector unsigned char valigner = VEC_ALIGNER(srcp);
1057 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1058 width -= extrawidth;
1059 while (width) {
1060 vector unsigned char voverflow;
1061 vector unsigned char vd;
1062
1063 /* s = *srcp */
1064 voverflow = (vector unsigned char)vec_ld(15, srcp);
1065 vs = vec_perm(vs, voverflow, valigner);
1066 vs = vec_perm(vs, valpha, vsrcPermute);
1067
1068 /* d = *dstp */
1069 vd = (vector unsigned char)vec_ld(0, dstp);
1070 vd = vec_perm(vd, vd, vsdstPermute);
1071
1072 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1073
1074 /* set the alpha channel to full on */
1075 vd = vec_or(vd, valphamask);
1076 vd = vec_perm(vd, vbits, vdstPermute);
1077
1078 /* *dstp = res */
1079 vec_st((vector unsigned int)vd, 0, dstp);
1080
1081 srcp += 4;
1082 dstp += 4;
1083 width -= 4;
/* the block just loaded becomes the first operand of the next realignment */
1084 vs = voverflow;
1085 }
/* trailing pixels that did not fill a whole vector */
1086 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1087 }
1088 #undef ONE_PIXEL_BLEND
1089
1090 srcp += srcskip;
1091 dstp += dstskip;
1092 }
1093
1094 }
1095
1096
1097 /* fast RGB888->(A)RGB888 blending */
/*
 * Blends every source pixel over the destination with the per-surface alpha
 * (info->src->alpha). No channel swizzling is done and the top byte of each
 * result pixel is set to 0xff.
 * Each row is processed in three stages: scalar pixels until dstp is 16-byte
 * aligned, four pixels per AltiVec iteration, then a scalar loop for the
 * width % 4 leftover pixels.
 */
1098 static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
1099 {
1100 unsigned alpha = info->src->alpha;
1101 int height = info->d_height;
1102 Uint32 *srcp = (Uint32 *)info->s_pixels;
1103 int srcskip = info->s_skip >> 2;
1104 Uint32 *dstp = (Uint32 *)info->d_pixels;
1105 int dstskip = info->d_skip >> 2;
1106 vector unsigned char mergePermute;
1107 vector unsigned char valpha;
1108 vector unsigned char valphamask;
1109 vector unsigned short v1;
1110 vector unsigned short v8;
1111
1112 mergePermute = VEC_MERGE_PERMUTE();
1113 v1 = vec_splat_u16(1);
1114 v8 = vec_splat_u16(8);
1115
1116 /* set the alpha to 255 on the destination surf */
1117 valphamask = VEC_ALPHA_MASK();
1118
1119 /* set a vector full of alpha and 255-alpha */
1120 ((unsigned char *)&valpha)[0] = alpha;
1121 valpha = vec_splat(valpha, 0);
1122
1123 while(height--) {
1124 int width = info->d_width;
/* Scalar blend of one pixel per iteration while `condition` holds:
   red/blue are blended together, green separately. */
1125 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1126 Uint32 s = *srcp; \
1127 Uint32 d = *dstp; \
1128 Uint32 s1 = s & 0xff00ff; \
1129 Uint32 d1 = d & 0xff00ff; \
1130 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1131 & 0xff00ff; \
1132 s &= 0xff00; \
1133 d &= 0xff00; \
1134 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1135 *dstp = d1 | d | 0xff000000; \
1136 ++srcp; \
1137 ++dstp; \
1138 widthvar--; \
1139 }
/* leading pixels: run until dstp is 16-byte aligned or the row is exhausted */
1140 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1141 if (width > 0) {
1142 int extrawidth = (width % 4);
1143 vector unsigned char valigner = VEC_ALIGNER(srcp);
1144 vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
1145 width -= extrawidth;
1146 while (width) {
1147 vector unsigned char voverflow;
1148 vector unsigned char vd;
1149
1150 /* s = *srcp */
1151 voverflow = (vector unsigned char)vec_ld(15, srcp);
1152 vs = vec_perm(vs, voverflow, valigner);
1153
1154 /* d = *dstp */
1155 vd = (vector unsigned char)vec_ld(0, dstp);
1156
1157 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1158
1159 /* set the alpha channel to full on */
1160 vd = vec_or(vd, valphamask);
1161
1162 /* *dstp = res */
1163 vec_st((vector unsigned int)vd, 0, dstp);
1164
1165 srcp += 4;
1166 dstp += 4;
1167 width -= 4;
/* the block just loaded becomes the first operand of the next realignment */
1168 vs = voverflow;
1169 }
/* trailing pixels that did not fill a whole vector */
1170 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1171 }
1172 #undef ONE_PIXEL_BLEND
1173
1174 srcp += srcskip;
1175 dstp += dstskip;
1176 }
1177 }
1178 #endif /* USE_ALTIVEC_BLITTERS */
1179
424 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ 1180 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
425 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) 1181 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
426 { 1182 {
427 int width = info->d_width; 1183 int width = info->d_width;
428 int height = info->d_height; 1184 int height = info->d_height;
1370 if(sf->Amask == 0) { 2126 if(sf->Amask == 0) {
1371 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) { 2127 if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
1372 if(df->BytesPerPixel == 1) 2128 if(df->BytesPerPixel == 1)
1373 return BlitNto1SurfaceAlphaKey; 2129 return BlitNto1SurfaceAlphaKey;
1374 else 2130 else
1375 return BlitNtoNSurfaceAlphaKey; 2131 #ifdef USE_ALTIVEC_BLITTERS
2132 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 && SDL_HasAltiVec())
2133 return Blit32to32SurfaceAlphaKeyAltivec;
2134 else
2135 #endif
2136 return BlitNtoNSurfaceAlphaKey;
1376 } else { 2137 } else {
1377 /* Per-surface alpha blits */ 2138 /* Per-surface alpha blits */
1378 switch(df->BytesPerPixel) { 2139 switch(df->BytesPerPixel) {
1379 case 1: 2140 case 1:
1380 return BlitNto1SurfaceAlpha; 2141 return BlitNto1SurfaceAlpha;
1412 #ifdef MMX_ASMBLIT 2173 #ifdef MMX_ASMBLIT
1413 if(SDL_HasMMX()) 2174 if(SDL_HasMMX())
1414 return BlitRGBtoRGBSurfaceAlphaMMX; 2175 return BlitRGBtoRGBSurfaceAlphaMMX;
1415 else 2176 else
1416 #endif 2177 #endif
2178 #ifdef USE_ALTIVEC_BLITTERS
2179 if(SDL_HasAltiVec())
2180 return BlitRGBtoRGBSurfaceAlphaAltivec;
2181 else
2182 #endif
1417 return BlitRGBtoRGBSurfaceAlpha; 2183 return BlitRGBtoRGBSurfaceAlpha;
1418 } 2184 }
1419 else 2185 else
2186 #ifdef USE_ALTIVEC_BLITTERS
2187 if((sf->BytesPerPixel == 4) && SDL_HasAltiVec())
2188 return Blit32to32SurfaceAlphaAltivec;
2189 else
2190 #endif
1420 return BlitNtoNSurfaceAlpha; 2191 return BlitNtoNSurfaceAlpha;
1421 2192
1422 case 3: 2193 case 3:
1423 default: 2194 default:
1424 return BlitNtoNSurfaceAlpha; 2195 return BlitNtoNSurfaceAlpha;
1429 switch(df->BytesPerPixel) { 2200 switch(df->BytesPerPixel) {
1430 case 1: 2201 case 1:
1431 return BlitNto1PixelAlpha; 2202 return BlitNto1PixelAlpha;
1432 2203
1433 case 2: 2204 case 2:
2205 #ifdef USE_ALTIVEC_BLITTERS
2206 if(sf->BytesPerPixel == 4 &&
2207 df->Gmask == 0x7e0 &&
2208 df->Bmask == 0x1f)
2209 return Blit32to565PixelAlphaAltivec;
2210 else
2211 #endif
1434 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000 2212 if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
1435 && sf->Gmask == 0xff00 2213 && sf->Gmask == 0xff00
1436 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) 2214 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
1437 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { 2215 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
1438 if(df->Gmask == 0x7e0) 2216 if(df->Gmask == 0x7e0)
1455 else 2233 else
1456 if(SDL_HasMMX()) 2234 if(SDL_HasMMX())
1457 return BlitRGBtoRGBPixelAlphaMMX; 2235 return BlitRGBtoRGBPixelAlphaMMX;
1458 else 2236 else
1459 #endif 2237 #endif
2238 #ifdef USE_ALTIVEC_BLITTERS
2239 if(SDL_HasAltiVec())
2240 return BlitRGBtoRGBPixelAlphaAltivec;
2241 else
2242 #endif
1460 return BlitRGBtoRGBPixelAlpha; 2243 return BlitRGBtoRGBPixelAlpha;
1461 } 2244 }
2245 #ifdef USE_ALTIVEC_BLITTERS
2246 if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec())
2247 return Blit32to32PixelAlphaAltivec;
2248 else
2249 #endif
1462 return BlitNtoNPixelAlpha; 2250 return BlitNtoNPixelAlpha;
1463 2251
1464 case 3: 2252 case 3:
1465 default: 2253 default:
1466 return BlitNtoNPixelAlpha; 2254 return BlitNtoNPixelAlpha;