comparison src/video/SDL_blit_A.c @ 5264:6a65c1fc07af

Updated CPU detection code for SSE3 and SSE4 and removed obsolete 3DNow! and Altivec support.
author Sam Lantinga <slouken@libsdl.org>
date Fri, 11 Feb 2011 14:51:04 -0800
parents f7b03b6838cb
children b530ef003506
comparison
equal deleted inserted replaced
5263:f26314c20071 5264:6a65c1fc07af
417 _mm_empty(); 417 _mm_empty();
418 } 418 }
419 419
420 #endif /* __MMX__ */ 420 #endif /* __MMX__ */
421 421
422 #if SDL_ALTIVEC_BLITTERS
423 #if __MWERKS__
424 #pragma altivec_model on
425 #endif
426 #if HAVE_ALTIVEC_H
427 #include <altivec.h>
428 #endif
429 #include <assert.h>
430
/* Vector literal syntax: parenthesised element lists on Mac OS X with GCC older than 4, brace-enclosed lists otherwise. */
431 #if (defined(__MACOSX__) && (__GNUC__ < 4))
432 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
433 (vector unsigned char) ( a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p )
434 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
435 (vector unsigned short) ( a,b,c,d,e,f,g,h )
436 #else
437 #define VECUINT8_LITERAL(a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
438 (vector unsigned char) { a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p }
439 #define VECUINT16_LITERAL(a,b,c,d,e,f,g,h) \
440 (vector unsigned short) { a,b,c,d,e,f,g,h }
441 #endif
442
/* Non-zero when any of the low four address bits of x is set, i.e. x is not 16-byte aligned. */
443 #define UNALIGNED_PTR(x) (((size_t) x) & 0x0000000F)
/* Debug aid: prints msg followed by the four 32-bit words of v in hexadecimal. */
444 #define VECPRINT(msg, v) do { \
445 vector unsigned int tmpvec = (vector unsigned int)(v); \
446 unsigned int *vp = (unsigned int *)&tmpvec; \
447 printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
448 } while (0)
449
450 /* the permutation vector that takes the high bytes out of all the appropriate shorts
451 (vector unsigned char)(
452 0x00, 0x10, 0x02, 0x12,
453 0x04, 0x14, 0x06, 0x16,
454 0x08, 0x18, 0x0A, 0x1A,
455 0x0C, 0x1C, 0x0E, 0x1E );
456 */
457 #define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F)))
/* Four 32-bit words each holding 24 (12 + 12); used as a shift count. */
458 #define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12)))
/* All-ones words shifted left by 24: only the top byte of each 32-bit word stays set. */
459 #define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24()))
/* Permute vector the blitters pass to vec_perm to combine two adjacent loads into the 16 bytes starting at src:
   vec_lvsl(0, src) when src is unaligned, otherwise vec_lvsl(8, src) with 8 added to every byte. */
460 #define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) \
461 ? vec_lvsl(0, src) \
462 : vec_add(vec_lvsl(8, src), vec_splat_u8(8)))
463
464
/*
 * Blends the bytes of vs over the bytes of vd and stores the result back in vd.
 * For each byte the macro forms t = vs*valpha + vd*(~valpha) as a 16-bit value
 * (even bytes via vec_mule, odd bytes via vec_mulo), then computes
 * (t + 1) + ((t + 1) >> 8); mergePermute finally picks the result bytes out of
 * the two 16-bit vectors (the ">>8" step noted in the last comment below).
 * v1_16 is added to, and v8_16 is the shift count for, the 16-bit intermediates;
 * the callers pass vectors splatted with 1 and 8 respectively.
 */
465 #define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { \
466 /* vtemp1 contains source AAGGAAGGAAGGAAGG */ \
467 vector unsigned short vtemp1 = vec_mule(vs, valpha); \
468 /* vtemp2 contains source RRBBRRBBRRBBRRBB */ \
469 vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
470 /* valpha2 is 255-alpha */ \
471 vector unsigned char valpha2 = vec_nor(valpha, valpha); \
472 /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
473 vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
474 /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
475 vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
476 /* add source and dest */ \
477 vtemp1 = vec_add(vtemp1, vtemp3); \
478 vtemp2 = vec_add(vtemp2, vtemp4); \
479 /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8) */ \
480 vtemp1 = vec_add(vtemp1, v1_16); \
481 vtemp3 = vec_sr(vtemp1, v8_16); \
482 vtemp1 = vec_add(vtemp1, vtemp3); \
483 /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
484 vtemp2 = vec_add(vtemp2, v1_16); \
485 vtemp4 = vec_sr(vtemp2, v8_16); \
486 vtemp2 = vec_add(vtemp2, vtemp4); \
487 /* (>>8) and get ARGBARGBARGBARGB */ \
488 vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
489 } while (0)
490
491 /* Calculate the permute vector used for 32->32 swizzling */
/*
 * srcfmt / dstfmt: pixel formats to convert between; a NULL pointer selects
 * the built-in ARGB layout defined below (R shift 16, G shift 8, B shift 0,
 * A shift 24).
 * Returns a 16-byte permute vector covering four 32-bit pixels.
 */
492 static vector unsigned char
493 calc_swizzle32(const SDL_PixelFormat * srcfmt, const SDL_PixelFormat * dstfmt)
494 {
495 /*
496 * We have to assume that the bits that aren't used by other
497 * colors are alpha, and it's one complete byte, since some formats
498 * leave alpha with a zero mask, but we should still swizzle the bits.
499 */
500 /* ARGB */
501 const static struct SDL_PixelFormat default_pixel_format = {
502 NULL, 0, 0,
503 0, 0, 0, 0,
504 16, 8, 0, 24,
505 0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000
506 };
507 if (!srcfmt) {
508 srcfmt = &default_pixel_format;
509 }
510 if (!dstfmt) {
511 dstfmt = &default_pixel_format;
512 }
/* Per-pixel base offsets (0, 4, 8, 12), added to the single-pixel pattern at the end. */
513 const vector unsigned char plus = VECUINT8_LITERAL(0x00, 0x00, 0x00, 0x00,
514 0x04, 0x04, 0x04, 0x04,
515 0x08, 0x08, 0x08, 0x08,
516 0x0C, 0x0C, 0x0C,
517 0x0C);
518 vector unsigned char vswiz;
519 vector unsigned int srcvec;
/* RESHIFT maps a source channel's bit shift to 3 - (shift / 8); that value is then
   placed at the destination channel's bit position inside one 32-bit word. */
520 #define RESHIFT(X) (3 - ((X) >> 3))
521 Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift);
522 Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
523 Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
524 Uint32 amask;
525 /* Use zero for alpha if either surface doesn't have alpha */
/* NOTE(review): the value used here is the index 0x10, not a literal zero; in vec_perm,
   indices 0x10 and above select from the second operand, so the alpha byte comes from
   whatever the caller passes there (the callers in this file pass v0, valpha, vbits or
   the source vector itself) - confirm. */
526 if (dstfmt->Amask) {
527 amask =
528 ((srcfmt->Amask) ? RESHIFT(srcfmt->
529 Ashift) : 0x10) << (dstfmt->Ashift);
530 } else {
531 amask =
532 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^
533 0xFFFFFFFF);
534 }
535 #undef RESHIFT
/* Store the one-pixel pattern in element 0, replicate it to all four words, then add the per-pixel offsets. */
536 ((unsigned int *) (char *) &srcvec)[0] = (rmask | gmask | bmask | amask);
537 vswiz = vec_add(plus, (vector unsigned char) vec_splat(srcvec, 0));
538 return (vswiz);
539 }
540
/*
 * Blends a 32-bit source that carries per-pixel alpha onto a 16-bit 5-6-5 destination.
 * Each row is processed in three phases: scalar pixels until dst is 16-byte aligned,
 * an AltiVec loop that handles 8 pixels per iteration, and scalar pixels for the
 * remaining (width % 8).
 * info supplies src/dst pointers, per-row skips, dst_w/dst_h and the source format.
 */
541 static void
542 Blit32to565PixelAlphaAltivec(SDL_BlitInfo * info)
543 {
544 int height = info->dst_h;
545 Uint8 *src = (Uint8 *) info->src;
546 int srcskip = info->src_skip;
547 Uint8 *dst = (Uint8 *) info->dst;
548 int dstskip = info->dst_skip;
549 SDL_PixelFormat *srcfmt = info->src_fmt;
550
551 vector unsigned char v0 = vec_splat_u8(0);
552 vector unsigned short v8_16 = vec_splat_u16(8);
553 vector unsigned short v1_16 = vec_splat_u16(1);
554 vector unsigned short v2_16 = vec_splat_u16(2);
555 vector unsigned short v3_16 = vec_splat_u16(3);
556 vector unsigned int v8_32 = vec_splat_u32(8);
557 vector unsigned int v16_32 = vec_add(v8_32, v8_32);
558 vector unsigned short v3f =
559 VECUINT16_LITERAL(0x003f, 0x003f, 0x003f, 0x003f,
560 0x003f, 0x003f, 0x003f, 0x003f);
561 vector unsigned short vfc =
562 VECUINT16_LITERAL(0x00fc, 0x00fc, 0x00fc, 0x00fc,
563 0x00fc, 0x00fc, 0x00fc, 0x00fc);
564
565 /*
566 0x10 - 0x1f is the alpha
567 0x00 - 0x0e evens are the red
568 0x01 - 0x0f odds are zero
569 */
570 vector unsigned char vredalpha1 = VECUINT8_LITERAL(0x10, 0x00, 0x01, 0x01,
571 0x10, 0x02, 0x01, 0x01,
572 0x10, 0x04, 0x01, 0x01,
573 0x10, 0x06, 0x01,
574 0x01);
/* Same pattern with 8 << 16 added to every 32-bit word, i.e. 8 added to the second index byte of each group of four. */
575 vector unsigned char vredalpha2 =
576 (vector unsigned char) (vec_add((vector unsigned int) vredalpha1,
577 vec_sl(v8_32, v16_32))
578 );
579 /*
580 0x00 - 0x0f is ARxx ARxx ARxx ARxx
581 0x11 - 0x1f odds are blue
582 */
583 vector unsigned char vblue1 = VECUINT8_LITERAL(0x00, 0x01, 0x02, 0x11,
584 0x04, 0x05, 0x06, 0x13,
585 0x08, 0x09, 0x0a, 0x15,
586 0x0c, 0x0d, 0x0e, 0x17);
587 vector unsigned char vblue2 =
588 (vector unsigned char) (vec_add((vector unsigned int) vblue1, v8_32)
589 );
590 /*
591 0x00 - 0x0f is ARxB ARxB ARxB ARxB
592 0x10 - 0x1e evens are green
593 */
594 vector unsigned char vgreen1 = VECUINT8_LITERAL(0x00, 0x01, 0x10, 0x03,
595 0x04, 0x05, 0x12, 0x07,
596 0x08, 0x09, 0x14, 0x0b,
597 0x0c, 0x0d, 0x16, 0x0f);
598 vector unsigned char vgreen2 =
599 (vector unsigned
600 char) (vec_add((vector unsigned int) vgreen1, vec_sl(v8_32, v8_32))
601 );
602 vector unsigned char vgmerge = VECUINT8_LITERAL(0x00, 0x02, 0x00, 0x06,
603 0x00, 0x0a, 0x00, 0x0e,
604 0x00, 0x12, 0x00, 0x16,
605 0x00, 0x1a, 0x00, 0x1e);
606 vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
/* Swizzle from the source format to the default ARGB layout (dstfmt NULL). */
607 vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL);
/* Byte indices 0,0,0,0,4,4,4,4,...: replicates the first byte of every pixel across that pixel. */
608 vector unsigned char valphaPermute =
609 vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
610
/* Builds 0xf800 in every 16-bit lane (0xf9f9 shifted left by 8); used below as the red-field mask. */
611 vector unsigned short vf800 = (vector unsigned short) vec_splat_u8(-7);
612 vf800 = vec_sl(vf800, vec_splat_u16(8));
613
614 while (height--) {
615 int extrawidth;
616 vector unsigned char valigner;
617 vector unsigned char vsrc;
618 vector unsigned char voverflow;
619 int width = info->dst_w;
620
621 #define ONE_PIXEL_BLEND(condition, widthvar) \
622 while (condition) { \
623 Uint32 Pixel; \
624 unsigned sR, sG, sB, dR, dG, dB, sA; \
625 DISEMBLE_RGBA(src, 4, srcfmt, Pixel, sR, sG, sB, sA); \
626 if(sA) { \
627 unsigned short dstpixel = *((unsigned short *)dst); \
628 dR = (dstpixel >> 8) & 0xf8; \
629 dG = (dstpixel >> 3) & 0xfc; \
630 dB = (dstpixel << 3) & 0xf8; \
631 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
632 *((unsigned short *)dst) = ( \
633 ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
634 ); \
635 } \
636 src += 4; \
637 dst += 2; \
638 widthvar--; \
639 }
/* Phase 1: scalar blend until dst reaches a 16-byte boundary (or the row ends). */
640 ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width);
641 extrawidth = (width % 8);
642 valigner = VEC_ALIGNER(src);
643 vsrc = (vector unsigned char) vec_ld(0, src);
644 width -= extrawidth;
/* Phase 2: 8 pixels per iteration (32 source bytes, 16 destination bytes). */
645 while (width) {
646 vector unsigned char valpha;
647 vector unsigned char vsrc1, vsrc2;
648 vector unsigned char vdst1, vdst2;
649 vector unsigned short vR, vG, vB;
650 vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
651
652 /* Load 8 pixels from src as ARGB */
653 voverflow = (vector unsigned char) vec_ld(15, src);
654 vsrc = vec_perm(vsrc, voverflow, valigner);
655 vsrc1 = vec_perm(vsrc, vsrc, vpermute);
656 src += 16;
657 vsrc = (vector unsigned char) vec_ld(15, src);
658 voverflow = vec_perm(voverflow, vsrc, valigner);
659 vsrc2 = vec_perm(voverflow, voverflow, vpermute);
660 src += 16;
661
662 /* Load 8 pixels from dst as XRGB */
663 voverflow = vec_ld(0, dst);
664 vR = vec_and((vector unsigned short) voverflow, vf800);
665 vB = vec_sl((vector unsigned short) voverflow, v3_16);
666 vG = vec_sl(vB, v2_16);
667 vdst1 =
668 (vector unsigned char) vec_perm((vector unsigned char) vR,
669 (vector unsigned char) vR,
670 vredalpha1);
671 vdst1 = vec_perm(vdst1, (vector unsigned char) vB, vblue1);
672 vdst1 = vec_perm(vdst1, (vector unsigned char) vG, vgreen1);
673 vdst2 =
674 (vector unsigned char) vec_perm((vector unsigned char) vR,
675 (vector unsigned char) vR,
676 vredalpha2);
677 vdst2 = vec_perm(vdst2, (vector unsigned char) vB, vblue2);
678 vdst2 = vec_perm(vdst2, (vector unsigned char) vG, vgreen2);
679
680 /* Alpha blend 8 pixels as ARGB */
681 valpha = vec_perm(vsrc1, v0, valphaPermute);
682 VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16,
683 v8_16);
684 valpha = vec_perm(vsrc2, v0, valphaPermute);
685 VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16,
686 v8_16);
687
688 /* Convert 8 pixels to 565 */
689 vpixel = (vector unsigned short) vec_packpx((vector unsigned int)
690 vdst1,
691 (vector unsigned int)
692 vdst2);
693 vgpixel = (vector unsigned short) vec_perm(vdst1, vdst2, vgmerge);
694 vgpixel = vec_and(vgpixel, vfc);
695 vgpixel = vec_sl(vgpixel, v3_16);
696 vrpixel = vec_sl(vpixel, v1_16);
697 vrpixel = vec_and(vrpixel, vf800);
698 vbpixel = vec_and(vpixel, v3f);
699 vdst1 =
700 vec_or((vector unsigned char) vrpixel,
701 (vector unsigned char) vgpixel);
702 vdst1 = vec_or(vdst1, (vector unsigned char) vbpixel);
703
704 /* Store 8 pixels */
705 vec_st(vdst1, 0, dst);
706
707 width -= 8;
708 dst += 16;
709 }
/* Phase 3: scalar blend for the (width % 8) pixels left over. */
710 ONE_PIXEL_BLEND((extrawidth), extrawidth);
711 #undef ONE_PIXEL_BLEND
712 src += srcskip;
713 dst += dstskip;
714 }
715 }
716
/*
 * Blends a 32-bit source onto a 32-bit destination with the per-surface alpha
 * info->a, leaving the destination untouched wherever the source pixel matches
 * info->colorkey. Each row runs scalar code until dstp is 16-byte aligned, then
 * an AltiVec loop over 4 pixels at a time, then scalar code for (width % 4).
 * NOTE(review): the scalar path compares the retrieved Pixel directly against
 * ckey, whereas the vector path first masks the source with rgbmask - confirm
 * both agree for every source format this blitter is selected for.
 */
717 static void
718 Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo * info)
719 {
720 int height = info->dst_h;
721 Uint32 *srcp = (Uint32 *) info->src;
/* Skips are shifted right by 2 to match the Uint32 pointer arithmetic below. */
722 int srcskip = info->src_skip >> 2;
723 Uint32 *dstp = (Uint32 *) info->dst;
724 int dstskip = info->dst_skip >> 2;
725 SDL_PixelFormat *srcfmt = info->src_fmt;
726 SDL_PixelFormat *dstfmt = info->dst_fmt;
727 unsigned sA = info->a;
728 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
729 Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
730 Uint32 ckey = info->colorkey;
731 vector unsigned char mergePermute;
732 vector unsigned char vsrcPermute;
733 vector unsigned char vdstPermute;
734 vector unsigned char vsdstPermute;
735 vector unsigned char valpha;
736 vector unsigned char valphamask;
737 vector unsigned char vbits;
738 vector unsigned char v0;
739 vector unsigned short v1;
740 vector unsigned short v8;
741 vector unsigned int vckey;
742 vector unsigned int vrgbmask;
743
744 mergePermute = VEC_MERGE_PERMUTE();
745 v0 = vec_splat_u8(0);
746 v1 = vec_splat_u16(1);
747 v8 = vec_splat_u16(8);
748
749 /* set the alpha to 255 on the destination surf */
750 valphamask = VEC_ALPHA_MASK();
751
/* source -> default ARGB, default ARGB -> destination, destination -> default ARGB */
752 vsrcPermute = calc_swizzle32(srcfmt, NULL);
753 vdstPermute = calc_swizzle32(NULL, dstfmt);
754 vsdstPermute = calc_swizzle32(dstfmt, NULL);
755
756 /* set a vector full of alpha (255-alpha is derived inside VEC_MULTIPLY_ALPHA) */
757 ((unsigned char *) &valpha)[0] = sA;
758 valpha = vec_splat(valpha, 0);
759 vbits = (vector unsigned char) vec_splat_s8(-1);
760
/* Replicate the masked colour key and the RGB mask into all four words. */
761 ckey &= rgbmask;
762 ((unsigned int *) (char *) &vckey)[0] = ckey;
763 vckey = vec_splat(vckey, 0);
764 ((unsigned int *) (char *) &vrgbmask)[0] = rgbmask;
765 vrgbmask = vec_splat(vrgbmask, 0);
766
767 while (height--) {
768 int width = info->dst_w;
769 #define ONE_PIXEL_BLEND(condition, widthvar) \
770 while (condition) { \
771 Uint32 Pixel; \
772 unsigned sR, sG, sB, dR, dG, dB; \
773 RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, Pixel); \
774 if(sA && Pixel != ckey) { \
775 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB); \
776 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
777 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
778 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
779 } \
780 dstp++; \
781 srcp++; \
782 widthvar--; \
783 }
784 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
785 if (width > 0) {
786 int extrawidth = (width % 4);
787 vector unsigned char valigner = VEC_ALIGNER(srcp);
788 vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
789 width -= extrawidth;
790 while (width) {
791 vector unsigned char vsel;
792 vector unsigned char voverflow;
793 vector unsigned char vd;
794 vector unsigned char vd_orig;
795
796 /* s = *srcp */
797 voverflow = (vector unsigned char) vec_ld(15, srcp);
798 vs = vec_perm(vs, voverflow, valigner);
799
800 /* vsel is set for items that match the key */
801 vsel =
802 (vector unsigned char) vec_and((vector unsigned int) vs,
803 vrgbmask);
804 vsel = (vector unsigned char) vec_cmpeq((vector unsigned int)
805 vsel, vckey);
806
807 /* permute to source format */
808 vs = vec_perm(vs, valpha, vsrcPermute);
809
810 /* d = *dstp */
811 vd = (vector unsigned char) vec_ld(0, dstp);
812 vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
813
814 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
815
816 /* set the alpha channel to full on */
817 vd = vec_or(vd, valphamask);
818
819 /* mask out color key */
820 vd = vec_sel(vd, vd_orig, vsel);
821
822 /* permute to dest format */
823 vd = vec_perm(vd, vbits, vdstPermute);
824
825 /* *dstp = res */
826 vec_st((vector unsigned int) vd, 0, dstp);
827
828 srcp += 4;
829 dstp += 4;
830 width -= 4;
/* the second load of this iteration becomes the first half of the next one */
831 vs = voverflow;
832 }
833 ONE_PIXEL_BLEND((extrawidth), extrawidth);
834 }
835 #undef ONE_PIXEL_BLEND
836
837 srcp += srcskip;
838 dstp += dstskip;
839 }
840 }
841
842
/*
 * Blends a 32-bit source with per-pixel alpha onto a 32-bit destination, with
 * swizzling between the two pixel formats. The destination's own alpha byte is
 * kept: the vector path saves it in vdstalpha and ORs it back after blending,
 * and the scalar path re-assembles the pixel with the dA it read from dstp.
 * Each row runs scalar code until dstp is 16-byte aligned, then 4 pixels per
 * AltiVec iteration, then scalar code for (width % 4).
 */
843 static void
844 Blit32to32PixelAlphaAltivec(SDL_BlitInfo * info)
845 {
846 int width = info->dst_w;
847 int height = info->dst_h;
848 Uint32 *srcp = (Uint32 *) info->src;
849 int srcskip = info->src_skip >> 2;
850 Uint32 *dstp = (Uint32 *) info->dst;
851 int dstskip = info->dst_skip >> 2;
852 SDL_PixelFormat *srcfmt = info->src_fmt;
853 SDL_PixelFormat *dstfmt = info->dst_fmt;
854 vector unsigned char mergePermute;
855 vector unsigned char valphaPermute;
856 vector unsigned char vsrcPermute;
857 vector unsigned char vdstPermute;
858 vector unsigned char vsdstPermute;
859 vector unsigned char valphamask;
860 vector unsigned char vpixelmask;
861 vector unsigned char v0;
862 vector unsigned short v1;
863 vector unsigned short v8;
864
865 v0 = vec_splat_u8(0);
866 v1 = vec_splat_u16(1);
867 v8 = vec_splat_u16(8);
868 mergePermute = VEC_MERGE_PERMUTE();
869 valphamask = VEC_ALPHA_MASK();
/* Byte indices 0,0,0,0,4,4,4,4,...: replicates the first byte of every pixel across that pixel. */
870 valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
/* Complement of the alpha mask: selects the three non-alpha bytes of each pixel. */
871 vpixelmask = vec_nor(valphamask, v0);
872 vsrcPermute = calc_swizzle32(srcfmt, NULL);
873 vdstPermute = calc_swizzle32(NULL, dstfmt);
874 vsdstPermute = calc_swizzle32(dstfmt, NULL);
875
876 while (height--) {
877 width = info->dst_w;
878 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
879 Uint32 Pixel; \
880 unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
881 DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, Pixel, sR, sG, sB, sA); \
882 if(sA) { \
883 DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, Pixel, dR, dG, dB, dA); \
884 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
885 ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
886 } \
887 ++srcp; \
888 ++dstp; \
889 widthvar--; \
890 }
891 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
892 if (width > 0) {
893 /* vsrcPermute */
894 /* vdstPermute */
895 int extrawidth = (width % 4);
896 vector unsigned char valigner = VEC_ALIGNER(srcp);
897 vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
898 width -= extrawidth;
899 while (width) {
900 vector unsigned char voverflow;
901 vector unsigned char vd;
902 vector unsigned char valpha;
903 vector unsigned char vdstalpha;
904 /* s = *srcp */
905 voverflow = (vector unsigned char) vec_ld(15, srcp);
906 vs = vec_perm(vs, voverflow, valigner);
907 vs = vec_perm(vs, v0, vsrcPermute);
908
909 valpha = vec_perm(vs, v0, valphaPermute);
910
911 /* d = *dstp */
912 vd = (vector unsigned char) vec_ld(0, dstp);
913 vd = vec_perm(vd, v0, vsdstPermute);
914 vdstalpha = vec_and(vd, valphamask);
915
916 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
917
918 /* set the alpha to the dest alpha */
919 vd = vec_and(vd, vpixelmask);
920 vd = vec_or(vd, vdstalpha);
921 vd = vec_perm(vd, v0, vdstPermute);
922
923 /* *dstp = res */
924 vec_st((vector unsigned int) vd, 0, dstp);
925
926 srcp += 4;
927 dstp += 4;
928 width -= 4;
/* the second load of this iteration becomes the first half of the next one */
929 vs = voverflow;
930
931 }
932 ONE_PIXEL_BLEND((extrawidth), extrawidth);
933 }
934 srcp += srcskip;
935 dstp += dstskip;
936 #undef ONE_PIXEL_BLEND
937 }
938 }
939
940 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * No format swizzling is done here: the scalar path takes alpha from the top
 * byte of the source word (s >> 24) and uses fixed 0xff00ff / 0xff00 channel
 * masks. The destination's top byte is preserved in both the scalar and the
 * vector path. Each row runs scalar code until dstp is 16-byte aligned, then
 * 4 pixels per AltiVec iteration, then scalar code for (width % 4).
 */
941 static void
942 BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo * info)
943 {
944 int width = info->dst_w;
945 int height = info->dst_h;
946 Uint32 *srcp = (Uint32 *) info->src;
947 int srcskip = info->src_skip >> 2;
948 Uint32 *dstp = (Uint32 *) info->dst;
949 int dstskip = info->dst_skip >> 2;
950 vector unsigned char mergePermute;
951 vector unsigned char valphaPermute;
952 vector unsigned char valphamask;
953 vector unsigned char vpixelmask;
954 vector unsigned char v0;
955 vector unsigned short v1;
956 vector unsigned short v8;
957 v0 = vec_splat_u8(0);
958 v1 = vec_splat_u16(1);
959 v8 = vec_splat_u16(8);
960 mergePermute = VEC_MERGE_PERMUTE();
961 valphamask = VEC_ALPHA_MASK();
/* Byte indices 0,0,0,0,4,4,4,4,...: replicates the first byte of every pixel across that pixel. */
962 valphaPermute = vec_and(vec_lvsl(0, (int *) NULL), vec_splat_u8(0xC));
963
964
/* Complement of the alpha mask: selects the three non-alpha bytes of each pixel. */
965 vpixelmask = vec_nor(valphamask, v0);
966 while (height--) {
967 width = info->dst_w;
968 #define ONE_PIXEL_BLEND(condition, widthvar) \
969 while ((condition)) { \
970 Uint32 dalpha; \
971 Uint32 d; \
972 Uint32 s1; \
973 Uint32 d1; \
974 Uint32 s = *srcp; \
975 Uint32 alpha = s >> 24; \
976 if(alpha) { \
977 if(alpha == SDL_ALPHA_OPAQUE) { \
978 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
979 } else { \
980 d = *dstp; \
981 dalpha = d & 0xff000000; \
982 s1 = s & 0xff00ff; \
983 d1 = d & 0xff00ff; \
984 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
985 s &= 0xff00; \
986 d &= 0xff00; \
987 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
988 *dstp = d1 | d | dalpha; \
989 } \
990 } \
991 ++srcp; \
992 ++dstp; \
993 widthvar--; \
994 }
995 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
996 if (width > 0) {
997 int extrawidth = (width % 4);
998 vector unsigned char valigner = VEC_ALIGNER(srcp);
999 vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
1000 width -= extrawidth;
1001 while (width) {
1002 vector unsigned char voverflow;
1003 vector unsigned char vd;
1004 vector unsigned char valpha;
1005 vector unsigned char vdstalpha;
1006 /* s = *srcp */
1007 voverflow = (vector unsigned char) vec_ld(15, srcp);
1008 vs = vec_perm(vs, voverflow, valigner);
1009
1010 valpha = vec_perm(vs, v0, valphaPermute);
1011
1012 /* d = *dstp */
1013 vd = (vector unsigned char) vec_ld(0, dstp);
1014 vdstalpha = vec_and(vd, valphamask);
1015
1016 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1017
1018 /* set the alpha to the dest alpha */
1019 vd = vec_and(vd, vpixelmask);
1020 vd = vec_or(vd, vdstalpha);
1021
1022 /* *dstp = res */
1023 vec_st((vector unsigned int) vd, 0, dstp);
1024
1025 srcp += 4;
1026 dstp += 4;
1027 width -= 4;
/* the second load of this iteration becomes the first half of the next one */
1028 vs = voverflow;
1029 }
1030 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1031 }
1032 srcp += srcskip;
1033 dstp += dstskip;
1034 }
1035 #undef ONE_PIXEL_BLEND
1036 }
1037
/*
 * Blends a 32-bit source onto a 32-bit destination with the per-surface alpha
 * info->a, swizzling between the two pixel formats. The scalar path writes dA
 * (SDL_ALPHA_OPAQUE when the destination format has an alpha mask, else 0) as
 * the result alpha; the vector path ORs in valphamask before permuting to the
 * destination layout. Each row runs scalar code until dstp is 16-byte aligned,
 * then 4 pixels per AltiVec iteration, then scalar code for (width % 4).
 */
1038 static void
1039 Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo * info)
1040 {
1041 /* XXX : 6 */
1042 int height = info->dst_h;
1043 Uint32 *srcp = (Uint32 *) info->src;
1044 int srcskip = info->src_skip >> 2;
1045 Uint32 *dstp = (Uint32 *) info->dst;
1046 int dstskip = info->dst_skip >> 2;
1047 SDL_PixelFormat *srcfmt = info->src_fmt;
1048 SDL_PixelFormat *dstfmt = info->dst_fmt;
1049 unsigned sA = info->a;
1050 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
1051 vector unsigned char mergePermute;
1052 vector unsigned char vsrcPermute;
1053 vector unsigned char vdstPermute;
1054 vector unsigned char vsdstPermute;
1055 vector unsigned char valpha;
1056 vector unsigned char valphamask;
1057 vector unsigned char vbits;
1058 vector unsigned short v1;
1059 vector unsigned short v8;
1060
1061 mergePermute = VEC_MERGE_PERMUTE();
1062 v1 = vec_splat_u16(1);
1063 v8 = vec_splat_u16(8);
1064
1065 /* set the alpha to 255 on the destination surf */
1066 valphamask = VEC_ALPHA_MASK();
1067
/* source -> default ARGB, default ARGB -> destination, destination -> default ARGB */
1068 vsrcPermute = calc_swizzle32(srcfmt, NULL);
1069 vdstPermute = calc_swizzle32(NULL, dstfmt);
1070 vsdstPermute = calc_swizzle32(dstfmt, NULL);
1071
1072 /* set a vector full of alpha (255-alpha is derived inside VEC_MULTIPLY_ALPHA) */
1073 ((unsigned char *) &valpha)[0] = sA;
1074 valpha = vec_splat(valpha, 0);
1075 vbits = (vector unsigned char) vec_splat_s8(-1);
1076
1077 while (height--) {
1078 int width = info->dst_w;
1079 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1080 Uint32 Pixel; \
1081 unsigned sR, sG, sB, dR, dG, dB; \
1082 DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, Pixel, sR, sG, sB); \
1083 DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, Pixel, dR, dG, dB); \
1084 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
1085 ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
1086 ++srcp; \
1087 ++dstp; \
1088 widthvar--; \
1089 }
1090 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1091 if (width > 0) {
1092 int extrawidth = (width % 4);
1093 vector unsigned char valigner = VEC_ALIGNER(srcp);
1094 vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
1095 width -= extrawidth;
1096 while (width) {
1097 vector unsigned char voverflow;
1098 vector unsigned char vd;
1099
1100 /* s = *srcp */
1101 voverflow = (vector unsigned char) vec_ld(15, srcp);
1102 vs = vec_perm(vs, voverflow, valigner);
1103 vs = vec_perm(vs, valpha, vsrcPermute);
1104
1105 /* d = *dstp */
1106 vd = (vector unsigned char) vec_ld(0, dstp);
1107 vd = vec_perm(vd, vd, vsdstPermute);
1108
1109 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1110
1111 /* set the alpha channel to full on */
1112 vd = vec_or(vd, valphamask);
1113 vd = vec_perm(vd, vbits, vdstPermute);
1114
1115 /* *dstp = res */
1116 vec_st((vector unsigned int) vd, 0, dstp);
1117
1118 srcp += 4;
1119 dstp += 4;
1120 width -= 4;
/* the second load of this iteration becomes the first half of the next one */
1121 vs = voverflow;
1122 }
1123 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1124 }
1125 #undef ONE_PIXEL_BLEND
1126
1127 srcp += srcskip;
1128 dstp += dstskip;
1129 }
1130
1131 }
1132
1133
1134 /* fast RGB888->(A)RGB888 blending */
/*
 * Blends with the per-surface alpha info->a and no format swizzling. The
 * scalar path uses fixed 0xff00ff / 0xff00 channel masks and forces the top
 * byte of the result to 0xff; the vector path ORs in valphamask. Each row runs
 * scalar code until dstp is 16-byte aligned, then 4 pixels per AltiVec
 * iteration, then scalar code for (width % 4).
 */
1135 static void
1136 BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo * info)
1137 {
1138 unsigned alpha = info->a;
1139 int height = info->dst_h;
1140 Uint32 *srcp = (Uint32 *) info->src;
1141 int srcskip = info->src_skip >> 2;
1142 Uint32 *dstp = (Uint32 *) info->dst;
1143 int dstskip = info->dst_skip >> 2;
1144 vector unsigned char mergePermute;
1145 vector unsigned char valpha;
1146 vector unsigned char valphamask;
1147 vector unsigned short v1;
1148 vector unsigned short v8;
1149
1150 mergePermute = VEC_MERGE_PERMUTE();
1151 v1 = vec_splat_u16(1);
1152 v8 = vec_splat_u16(8);
1153
1154 /* set the alpha to 255 on the destination surf */
1155 valphamask = VEC_ALPHA_MASK();
1156
1157 /* set a vector full of alpha (255-alpha is derived inside VEC_MULTIPLY_ALPHA) */
1158 ((unsigned char *) &valpha)[0] = alpha;
1159 valpha = vec_splat(valpha, 0);
1160
1161 while (height--) {
1162 int width = info->dst_w;
1163 #define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
1164 Uint32 s = *srcp; \
1165 Uint32 d = *dstp; \
1166 Uint32 s1 = s & 0xff00ff; \
1167 Uint32 d1 = d & 0xff00ff; \
1168 d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
1169 & 0xff00ff; \
1170 s &= 0xff00; \
1171 d &= 0xff00; \
1172 d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
1173 *dstp = d1 | d | 0xff000000; \
1174 ++srcp; \
1175 ++dstp; \
1176 widthvar--; \
1177 }
1178 ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width);
1179 if (width > 0) {
1180 int extrawidth = (width % 4);
1181 vector unsigned char valigner = VEC_ALIGNER(srcp);
1182 vector unsigned char vs = (vector unsigned char) vec_ld(0, srcp);
1183 width -= extrawidth;
1184 while (width) {
1185 vector unsigned char voverflow;
1186 vector unsigned char vd;
1187
1188 /* s = *srcp */
1189 voverflow = (vector unsigned char) vec_ld(15, srcp);
1190 vs = vec_perm(vs, voverflow, valigner);
1191
1192 /* d = *dstp */
1193 vd = (vector unsigned char) vec_ld(0, dstp);
1194
1195 VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
1196
1197 /* set the alpha channel to full on */
1198 vd = vec_or(vd, valphamask);
1199
1200 /* *dstp = res */
1201 vec_st((vector unsigned int) vd, 0, dstp);
1202
1203 srcp += 4;
1204 dstp += 4;
1205 width -= 4;
/* the second load of this iteration becomes the first half of the next one */
1206 vs = voverflow;
1207 }
1208 ONE_PIXEL_BLEND((extrawidth), extrawidth);
1209 }
1210 #undef ONE_PIXEL_BLEND
1211
1212 srcp += srcskip;
1213 dstp += dstskip;
1214 }
1215 }
1216
1217 #if __MWERKS__
1218 #pragma altivec_model off
1219 #endif
1220 #endif /* SDL_ALTIVEC_BLITTERS */
1221
1222 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ 422 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
1223 static void 423 static void
1224 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info) 424 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
1225 { 425 {
1226 int width = info->dst_w; 426 int width = info->dst_w;
1335 /* *INDENT-ON* */ 535 /* *INDENT-ON* */
1336 srcp += srcskip; 536 srcp += srcskip;
1337 dstp += dstskip; 537 dstp += dstskip;
1338 } 538 }
1339 } 539 }
1340
1341 #ifdef __3dNOW__
1342 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
/*
 * For every pixel the source alpha is isolated with the source format's Amask:
 * zero alpha leaves the destination alone, full alpha (== amask) copies the
 * source colour bits while keeping the destination bits outside chanmask, and
 * any other value takes the MMX path computing dst + (((src - dst) * alpha) >> 8)
 * per channel. dmask zeroes the multiplier in the alpha lane so the destination
 * alpha passes through the add unchanged. _mm_empty() is called once after all
 * rows are done.
 */
1343 static void
1344 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
1345 {
1346 int width = info->dst_w;
1347 int height = info->dst_h;
1348 Uint32 *srcp = (Uint32 *) info->src;
1349 int srcskip = info->src_skip >> 2;
1350 Uint32 *dstp = (Uint32 *) info->dst;
1351 int dstskip = info->dst_skip >> 2;
1352 SDL_PixelFormat *sf = info->src_fmt;
1353 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
1354 Uint32 amask = sf->Amask;
1355 Uint32 ashift = sf->Ashift;
1356 Uint64 multmask;
1357
1358 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1359
1360 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
/* After unpacking bytes to 16-bit lanes, the alpha channel sits at bit offset ashift * 2; clear that lane. */
1361 multmask = 0xFFFF;
1362 multmask <<= (ashift * 2);
1363 multmask = ~multmask;
1364 dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
1365
1366 while (height--) {
1367 /* *INDENT-OFF* */
1368 DUFFS_LOOP4({
1369 Uint32 alpha;
1370
1371 _m_prefetch(srcp + 16);
1372 _m_prefetch(dstp + 16);
1373
1374 alpha = *srcp & amask;
1375 if (alpha == 0) {
1376 /* do nothing */
1377 } else if (alpha == amask) {
1378 /* copy RGB, keep dst alpha */
1379 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1380 } else {
1381 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1382 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1383
1384 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1385 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1386
1387 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1388 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1389 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1390 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1391 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1392
1393 /* blend */
1394 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1395 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1396 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1397 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1398 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1399
1400 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1401 }
1402 ++srcp;
1403 ++dstp;
1404 }, width);
1405 /* *INDENT-ON* */
1406 srcp += srcskip;
1407 dstp += dstskip;
1408 }
1409 _mm_empty();
1410 }
1411
1412 #endif /* __3dNOW__ */
1413 540
1414 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ 541 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1415 542
1416 /* blend a single 16 bit pixel at 50% */ 543 /* blend a single 16 bit pixel at 50% */
1417 #define BLEND16_50(d, s, mask) \ 544 #define BLEND16_50(d, s, mask) \
2128 switch (df->BytesPerPixel) { 1255 switch (df->BytesPerPixel) {
2129 case 1: 1256 case 1:
2130 return BlitNto1PixelAlpha; 1257 return BlitNto1PixelAlpha;
2131 1258
2132 case 2: 1259 case 2:
2133 #if SDL_ALTIVEC_BLITTERS 1260 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2134 if (sf->BytesPerPixel == 4 1261 && sf->Gmask == 0xff00
2135 && df->Gmask == 0x7e0 && df->Bmask == 0x1f 1262 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2136 && SDL_HasAltiVec()) 1263 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2137 return Blit32to565PixelAlphaAltivec;
2138 else
2139 #endif
2140 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
2141 && sf->Gmask == 0xff00
2142 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
2143 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
2144 if (df->Gmask == 0x7e0) 1264 if (df->Gmask == 0x7e0)
2145 return BlitARGBto565PixelAlpha; 1265 return BlitARGBto565PixelAlpha;
2146 else if (df->Gmask == 0x3e0) 1266 else if (df->Gmask == 0x3e0)
2147 return BlitARGBto555PixelAlpha; 1267 return BlitARGBto555PixelAlpha;
2148 } 1268 }
2150 1270
2151 case 4: 1271 case 4:
2152 if (sf->Rmask == df->Rmask 1272 if (sf->Rmask == df->Rmask
2153 && sf->Gmask == df->Gmask 1273 && sf->Gmask == df->Gmask
2154 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) { 1274 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
2155 #if defined(__MMX__) || defined(__3dNOW__) 1275 #if defined(__MMX__)
2156 if (sf->Rshift % 8 == 0 1276 if (sf->Rshift % 8 == 0
2157 && sf->Gshift % 8 == 0 1277 && sf->Gshift % 8 == 0
2158 && sf->Bshift % 8 == 0 1278 && sf->Bshift % 8 == 0
2159 && sf->Ashift % 8 == 0 && sf->Aloss == 0) { 1279 && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
2160 #ifdef __3dNOW__
2161 if (SDL_Has3DNow())
2162 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2163 #endif
2164 #ifdef __MMX__
2165 if (SDL_HasMMX()) 1280 if (SDL_HasMMX())
2166 return BlitRGBtoRGBPixelAlphaMMX; 1281 return BlitRGBtoRGBPixelAlphaMMX;
2167 #endif
2168 } 1282 }
2169 #endif /* __MMX__ || __3dNOW__ */ 1283 #endif /* __MMX__ */
2170 if (sf->Amask == 0xff000000) { 1284 if (sf->Amask == 0xff000000) {
2171 #if SDL_ALTIVEC_BLITTERS
2172 if (SDL_HasAltiVec())
2173 return BlitRGBtoRGBPixelAlphaAltivec;
2174 #endif
2175 return BlitRGBtoRGBPixelAlpha; 1285 return BlitRGBtoRGBPixelAlpha;
2176 } 1286 }
2177 } 1287 }
2178 #if SDL_ALTIVEC_BLITTERS 1288 return BlitNtoNPixelAlpha;
2179 if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec())
2180 return Blit32to32PixelAlphaAltivec;
2181 else
2182 #endif
2183 return BlitNtoNPixelAlpha;
2184 1289
2185 case 3: 1290 case 3:
2186 default: 1291 default:
2187 return BlitNtoNPixelAlpha; 1292 return BlitNtoNPixelAlpha;
2188 } 1293 }
2224 && sf->Gshift % 8 == 0 1329 && sf->Gshift % 8 == 0
2225 && sf->Bshift % 8 == 0 && SDL_HasMMX()) 1330 && sf->Bshift % 8 == 0 && SDL_HasMMX())
2226 return BlitRGBtoRGBSurfaceAlphaMMX; 1331 return BlitRGBtoRGBSurfaceAlphaMMX;
2227 #endif 1332 #endif
2228 if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) { 1333 if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
2229 #if SDL_ALTIVEC_BLITTERS
2230 if (SDL_HasAltiVec())
2231 return BlitRGBtoRGBSurfaceAlphaAltivec;
2232 #endif
2233 return BlitRGBtoRGBSurfaceAlpha; 1334 return BlitRGBtoRGBSurfaceAlpha;
2234 } 1335 }
2235 } 1336 }
2236 #if SDL_ALTIVEC_BLITTERS 1337 return BlitNtoNSurfaceAlpha;
2237 if ((sf->BytesPerPixel == 4) && SDL_HasAltiVec())
2238 return Blit32to32SurfaceAlphaAltivec;
2239 else
2240 #endif
2241 return BlitNtoNSurfaceAlpha;
2242 1338
2243 case 3: 1339 case 3:
2244 default: 1340 default:
2245 return BlitNtoNSurfaceAlpha; 1341 return BlitNtoNSurfaceAlpha;
2246 } 1342 }
2250 case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND: 1346 case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
2251 if (sf->Amask == 0) { 1347 if (sf->Amask == 0) {
2252 if (df->BytesPerPixel == 1) 1348 if (df->BytesPerPixel == 1)
2253 return BlitNto1SurfaceAlphaKey; 1349 return BlitNto1SurfaceAlphaKey;
2254 else 1350 else
2255 #if SDL_ALTIVEC_BLITTERS
2256 if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 &&
2257 SDL_HasAltiVec())
2258 return Blit32to32SurfaceAlphaKeyAltivec;
2259 else
2260 #endif
2261 return BlitNtoNSurfaceAlphaKey; 1351 return BlitNtoNSurfaceAlphaKey;
2262 } 1352 }
2263 break; 1353 break;
2264 } 1354 }
2265 1355