comparison src/video/SDL_surface.c @ 2249:5a58b57b6724

Added SSE and MMX optimization for SDL_FillRect()
author Sam Lantinga <slouken@libsdl.org>
date Thu, 16 Aug 2007 05:56:24 +0000
parents 31835fd24b2b
children 292bee385630
comparison
equal deleted inserted replaced
2248:5cd2a2293cf0 2249:5a58b57b6724
507 } 507 }
508 dstrect->w = dstrect->h = 0; 508 dstrect->w = dstrect->h = 0;
509 return 0; 509 return 0;
510 } 510 }
511 511
512 static int 512 #ifdef __SSE__
513 SDL_FillRect1(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color) 513 /* *INDENT-OFF* */
514 { 514
515 /* FIXME: We have to worry about packing order.. *sigh* */ 515 #define SSE_BEGIN \
516 SDL_SetError("1-bpp rect fill not yet implemented"); 516 DECLARE_ALIGNED(Uint32, cccc[4], 16); \
517 return -1; 517 cccc[0] = color; \
518 } 518 cccc[1] = color; \
519 519 cccc[2] = color; \
520 static int 520 cccc[3] = color; \
521 SDL_FillRect4(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color) 521 __m128 c128 = *(__m128 *)cccc;
522 { 522
523 /* FIXME: We have to worry about packing order.. *sigh* */ 523 #define SSE_WORK \
524 SDL_SetError("4-bpp rect fill not yet implemented"); 524 for (i = n / 64; i--;) { \
525 return -1; 525 _mm_stream_ps((float *)(p+0), c128); \
526 _mm_stream_ps((float *)(p+16), c128); \
527 _mm_stream_ps((float *)(p+32), c128); \
528 _mm_stream_ps((float *)(p+48), c128); \
529 p += 64; \
530 }
531
532 #define SSE_END
533
534 #define DEFINE_SSE_FILLRECT(bpp, type) \
535 static void \
536 SDL_FillRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
537 { \
538 SSE_BEGIN; \
539 \
540 while (h--) { \
541 int i, n = w * bpp; \
542 Uint8 *p = pixels; \
543 \
544 if (n > 15) { \
545 int adjust = 16 - ((uintptr_t)p & 15); \
546 if (adjust < 16) { \
547 n -= adjust; \
548 adjust /= bpp; \
549 while(adjust--) { \
550 *((type *)p) = (type)color; \
551 p += bpp; \
552 } \
553 } \
554 SSE_WORK; \
555 } \
556 if (n & 63) { \
557 int remainder = (n & 63); \
558 remainder /= bpp; \
559 while(remainder--) { \
560 *((type *)p) = (type)color; \
561 p += bpp; \
562 } \
563 } \
564 pixels += pitch; \
565 } \
566 \
567 SSE_END; \
568 }
569
570 DEFINE_SSE_FILLRECT(1, Uint8)
571 DEFINE_SSE_FILLRECT(2, Uint16)
572 DEFINE_SSE_FILLRECT(4, Uint32)
573
574 /* *INDENT-ON* */
575 #endif /* __SSE__ */
576
577 #ifdef __MMX__
578 /* *INDENT-OFF* */
579
580 #define MMX_BEGIN \
581 __m64 c64 = _mm_set_pi32(color, color)
582
583 #define MMX_WORK \
584 for (i = n / 64; i--;) { \
585 _mm_stream_pi((__m64 *)(p+0), c64); \
586 _mm_stream_pi((__m64 *)(p+8), c64); \
587 _mm_stream_pi((__m64 *)(p+16), c64); \
588 _mm_stream_pi((__m64 *)(p+24), c64); \
589 _mm_stream_pi((__m64 *)(p+32), c64); \
590 _mm_stream_pi((__m64 *)(p+40), c64); \
591 _mm_stream_pi((__m64 *)(p+48), c64); \
592 _mm_stream_pi((__m64 *)(p+56), c64); \
593 p += 64; \
594 }
595
596 #define MMX_END \
597 _mm_empty()
598
599 #define DEFINE_MMX_FILLRECT(bpp, type) \
600 static void \
601 SDL_FillRect##bpp##MMX(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
602 { \
603 MMX_BEGIN; \
604 \
605 while (h--) { \
606 int i, n = w * bpp; \
607 Uint8 *p = pixels; \
608 \
609 if (n > 7) { \
610 int adjust = 8 - ((uintptr_t)p & 7); \
611 if (adjust < 8) { \
612 n -= adjust; \
613 adjust /= bpp; \
614 while(adjust--) { \
615 *((type *)p) = (type)color; \
616 p += bpp; \
617 } \
618 } \
619 MMX_WORK; \
620 } \
621 if (n & 63) { \
622 int remainder = (n & 63); \
623 remainder /= bpp; \
624 while(remainder--) { \
625 *((type *)p) = (type)color; \
626 p += bpp; \
627 } \
628 } \
629 pixels += pitch; \
630 } \
631 \
632 MMX_END; \
633 }
634
635 DEFINE_MMX_FILLRECT(1, Uint8)
636 DEFINE_MMX_FILLRECT(2, Uint16)
637 DEFINE_MMX_FILLRECT(4, Uint32)
638
639 /* *INDENT-ON* */
640 #endif /* __MMX__ */
641
642 static void
643 SDL_FillRect1(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
644 {
645 while (h--) {
646 int n = w;
647 Uint8 *p = pixels;
648
649 if (n > 3) {
650 switch ((uintptr_t) p & 3) {
651 case 1:
652 *p++ = (Uint8) color;
653 --n;
654 case 2:
655 *p++ = (Uint8) color;
656 --n;
657 case 3:
658 *p++ = (Uint8) color;
659 --n;
660 }
661 SDL_memset4(p, color, (n >> 2));
662 }
663 if (n & 3) {
664 p += (n & ~3);
665 switch (n & 3) {
666 case 3:
667 *p++ = (Uint8) color;
668 case 2:
669 *p++ = (Uint8) color;
670 case 1:
671 *p++ = (Uint8) color;
672 }
673 }
674 pixels += pitch;
675 }
676 }
677
678 static void
679 SDL_FillRect2(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
680 {
681 while (h--) {
682 int n = w;
683 Uint16 *p = (Uint16 *) pixels;
684
685 if (n > 1) {
686 if ((uintptr_t) p & 2) {
687 *p++ = (Uint16) color;
688 --n;
689 }
690 SDL_memset4(p, color, (n >> 1));
691 }
692 if (n & 1) {
693 p[n - 1] = (Uint16) color;
694 }
695 pixels += pitch;
696 }
697 }
698
699 static void
700 SDL_FillRect3(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
701 {
702 Uint8 r = (Uint8) (color & 0xFF);
703 Uint8 g = (Uint8) ((color >> 8) & 0xFF);
704 Uint8 b = (Uint8) ((color >> 16) & 0xFF);
705
706 while (h--) {
707 int n = w;
708 Uint8 *p = pixels;
709
710 while (n--) {
711 *p++ = r;
712 *p++ = g;
713 *p++ = b;
714 }
715 pixels += pitch;
716 }
717 }
718
719 static void
720 SDL_FillRect4(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
721 {
722 while (h--) {
723 SDL_memset4(pixels, color, w);
724 pixels += pitch;
725 }
526 } 726 }
527 727
528 /* 728 /*
529 * This function performs a fast fill of the given rectangle with 'color' 729 * This function performs a fast fill of the given rectangle with 'color'
530 */ 730 */
531 int 731 int
532 SDL_FillRect(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color) 732 SDL_FillRect(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
533 { 733 {
534 int x, y; 734 Uint8 *pixels;
535 Uint8 *row;
536 735
537 /* This function doesn't work on surfaces < 8 bpp */ 736 /* This function doesn't work on surfaces < 8 bpp */
538 if (dst->format->BitsPerPixel < 8) { 737 if (dst->format->BitsPerPixel < 8) {
539 switch (dst->format->BitsPerPixel) { 738 SDL_SetError("Fill rect on unsupported surface format");
540 case 1: 739 return (-1);
541 return SDL_FillRect1(dst, dstrect, color);
542 break;
543 case 4:
544 return SDL_FillRect4(dst, dstrect, color);
545 break;
546 default:
547 SDL_SetError("Fill rect on unsupported surface format");
548 return (-1);
549 break;
550 }
551 } 740 }
552 741
553 /* If 'dstrect' == NULL, then fill the whole surface */ 742 /* If 'dstrect' == NULL, then fill the whole surface */
554 if (dstrect) { 743 if (dstrect) {
555 /* Perform clipping */ 744 /* Perform clipping */
562 751
563 /* Perform software fill */ 752 /* Perform software fill */
564 if (SDL_LockSurface(dst) != 0) { 753 if (SDL_LockSurface(dst) != 0) {
565 return (-1); 754 return (-1);
566 } 755 }
567 row = (Uint8 *) dst->pixels + dstrect->y * dst->pitch + 756
757 pixels =
758 (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
568 dstrect->x * dst->format->BytesPerPixel; 759 dstrect->x * dst->format->BytesPerPixel;
569 if (dst->format->palette || (color == 0)) { 760
570 x = dstrect->w * dst->format->BytesPerPixel; 761 switch (dst->format->BytesPerPixel) {
571 #ifndef __MACOSX__ /* memset() is optimized on Mac OS X */ 762 case 1:
572 if (!color && !((uintptr_t) row & 3) && !(x & 3)
573 && !(dst->pitch & 3)) {
574 int n = x >> 2;
575 for (y = dstrect->h; y; --y) {
576 SDL_memset4(row, 0, n);
577 row += dst->pitch;
578 }
579 } else
580 #endif /* !__MACOSX__ */
581 { 763 {
582 for (y = dstrect->h; y; y--) { 764 color |= (color << 8);
583 SDL_memset(row, color, x); 765 color |= (color << 16);
584 row += dst->pitch; 766 #ifdef __SSE__
585 } 767 if (SDL_HasSSE()) {
586 } 768 SDL_FillRect1SSE(pixels, dst->pitch, color, dstrect->w,
587 } else { 769 dstrect->h);
588 switch (dst->format->BytesPerPixel) { 770 break;
589 case 2: 771 }
590 { 772 #endif
591 Uint16 c = (Uint16) color; 773 #ifdef __MMX__
592 Uint32 cc = (Uint32) c << 16 | c; 774 if (SDL_HasMMX()) {
593 for (y = dstrect->h; y; --y) { 775 SDL_FillRect1MMX(pixels, dst->pitch, color, dstrect->w,
594 Uint16 *pixels = (Uint16 *) row; 776 dstrect->h);
595 int n = dstrect->w; 777 break;
596 if ((uintptr_t) pixels & 3) { 778 }
597 *pixels++ = c; 779 #endif
598 n--; 780 SDL_FillRect1(pixels, dst->pitch, color, dstrect->w, dstrect->h);
599 }
600 if (n >> 1)
601 SDL_memset4(pixels, cc, n >> 1);
602 if (n & 1)
603 pixels[n - 1] = c;
604 row += dst->pitch;
605 }
606 }
607 break; 781 break;
608 782 }
609 case 3: 783
610 #if SDL_BYTEORDER == SDL_BIG_ENDIAN 784 case 2:
611 color <<= 8; 785 {
786 color |= (color << 16);
787 #ifdef __SSE__
788 if (SDL_HasSSE()) {
789 SDL_FillRect2SSE(pixels, dst->pitch, color, dstrect->w,
790 dstrect->h);
791 break;
792 }
612 #endif 793 #endif
613 for (y = dstrect->h; y; --y) { 794 #ifdef __MMX__
614 Uint8 *pixels = row; 795 if (SDL_HasMMX()) {
615 for (x = dstrect->w; x; --x) { 796 SDL_FillRect2MMX(pixels, dst->pitch, color, dstrect->w,
616 SDL_memcpy(pixels, &color, 3); 797 dstrect->h);
617 pixels += 3; 798 break;
618 } 799 }
619 row += dst->pitch; 800 #endif
620 } 801 SDL_FillRect2(pixels, dst->pitch, color, dstrect->w, dstrect->h);
621 break; 802 break;
622 803 }
623 case 4: 804
624 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES 805 case 3:
625 if (SDL_HasSSE() && !((uintptr_t) row & 15) && !(dstrect->w & 3)) { 806 /* 24-bit RGB is a slow path, at least for now. */
626 Uint32 cccc[4] __attribute__ ((aligned(16))) = { 807 {
627 color, color, color, color}; 808 SDL_FillRect3(pixels, dst->pitch, color, dstrect->w, dstrect->h);
628 int i, n = dstrect->w / 4; 809 break;
629 __asm__ __volatile__(" movdqa (%0), %%xmm0\n":: 810 }
630 "r"(cccc):"memory"); 811
631 for (y = dstrect->h; y; --y) { 812 case 4:
632 Uint8 *pixels = row; 813 {
633 for (i = n / 2; i--;) { 814 #ifdef __SSE__
634 /* *INDENT-OFF* */ 815 if (SDL_HasSSE()) {
635 __asm__ __volatile__(" prefetchnta 256(%0)\n" 816 SDL_FillRect4SSE(pixels, dst->pitch, color, dstrect->w,
636 " movdqa %%xmm0, (%0)\n" 817 dstrect->h);
637 " movdqa %%xmm0, 16(%0)\n"::"r"(pixels):"memory");
638 /* *INDENT-ON* */
639 pixels += 32;
640 }
641 if (n & 1) {
642 __asm__ __volatile__(" movdqa %%xmm0, (%0)\n"::
643 "r"(pixels):"memory");
644 }
645 row += dst->pitch;
646 }
647 __asm__ __volatile__(" emms\n"::);
648 break; 818 break;
649 } 819 }
650 #endif 820 #endif
651 for (y = dstrect->h; y; --y) { 821 #ifdef __MMX__
652 SDL_memset4(row, color, dstrect->w); 822 if (SDL_HasMMX()) {
653 row += dst->pitch; 823 SDL_FillRect4MMX(pixels, dst->pitch, color, dstrect->w,
654 } 824 dstrect->h);
825 break;
826 }
827 #endif
828 SDL_FillRect4(pixels, dst->pitch, color, dstrect->w, dstrect->h);
655 break; 829 break;
656 } 830 }
657 } 831 }
832
658 SDL_UnlockSurface(dst); 833 SDL_UnlockSurface(dst);
659 834
660 /* We're done! */ 835 /* We're done! */
661 return (0); 836 return (0);
662 } 837 }