Mercurial > sdl-ios-xcode
comparison src/video/SDL_surface.c @ 2249:5a58b57b6724
Added SSE and MMX optimization for SDL_FillRect()
author | Sam Lantinga <slouken@libsdl.org> |
---|---|
date | Thu, 16 Aug 2007 05:56:24 +0000 |
parents | 31835fd24b2b |
children | 292bee385630 |
comparison
equal
deleted
inserted
replaced
2248:5cd2a2293cf0 | 2249:5a58b57b6724 |
---|---|
507 } | 507 } |
508 dstrect->w = dstrect->h = 0; | 508 dstrect->w = dstrect->h = 0; |
509 return 0; | 509 return 0; |
510 } | 510 } |
511 | 511 |
512 static int | 512 #ifdef __SSE__ |
513 SDL_FillRect1(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color) | 513 /* *INDENT-OFF* */ |
514 { | 514 |
515 /* FIXME: We have to worry about packing order.. *sigh* */ | 515 #define SSE_BEGIN \ |
516 SDL_SetError("1-bpp rect fill not yet implemented"); | 516 DECLARE_ALIGNED(Uint32, cccc[4], 16); \ |
517 return -1; | 517 cccc[0] = color; \ |
518 } | 518 cccc[1] = color; \ |
519 | 519 cccc[2] = color; \ |
520 static int | 520 cccc[3] = color; \ |
521 SDL_FillRect4(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color) | 521 __m128 c128 = *(__m128 *)cccc; |
522 { | 522 |
523 /* FIXME: We have to worry about packing order.. *sigh* */ | 523 #define SSE_WORK \ |
524 SDL_SetError("4-bpp rect fill not yet implemented"); | 524 for (i = n / 64; i--;) { \ |
525 return -1; | 525 _mm_stream_ps((float *)(p+0), c128); \ |
526 _mm_stream_ps((float *)(p+16), c128); \ | |
527 _mm_stream_ps((float *)(p+32), c128); \ | |
528 _mm_stream_ps((float *)(p+48), c128); \ | |
529 p += 64; \ | |
530 } | |
531 | |
532 #define SSE_END | |
533 | |
/*
 * DEFINE_SSE_FILLRECT(bpp, type) expands to
 *     static void SDL_FillRect<bpp>SSE(Uint8 *pixels, int pitch,
 *                                      Uint32 color, int w, int h)
 * which fills h rows of w pixels, bpp bytes per pixel, starting at 'pixels'
 * and stepping 'pitch' bytes from one row to the next.
 *
 * For every row (n = w * bpp bytes):
 *   - if n > 15, single 'type' stores are issued until p sits on a 16-byte
 *     boundary (skipped when p is already aligned, i.e. adjust == 16), then
 *     SSE_WORK writes the row in 64-byte chunks (n / 64 iterations);
 *   - the (n & 63) / bpp pixels that are left are written one 'type' at a time.
 *
 * SSE_BEGIN / SSE_WORK / SSE_END are expanded inside the generated function
 * and use its 'color', 'i', 'n' and 'p'.
 *
 * NOTE(review): 'adjust /= bpp' truncates, so the lead-in only reaches the
 * 16-byte boundary when the row start is a multiple of bpp -- presumably
 * guaranteed by the caller; confirm.
 */
534 #define DEFINE_SSE_FILLRECT(bpp, type) \ | |
535 static void \ | |
536 SDL_FillRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \ | |
537 { \ | |
538 SSE_BEGIN; \ | |
539 \ | |
540 while (h--) { \ | |
541 int i, n = w * bpp; \ | |
542 Uint8 *p = pixels; \ | |
543 \ | |
544 if (n > 15) { \ | |
545 int adjust = 16 - ((uintptr_t)p & 15); \ | |
546 if (adjust < 16) { \ | |
547 n -= adjust; \ | |
548 adjust /= bpp; \ | |
549 while(adjust--) { \ | |
550 *((type *)p) = (type)color; \ | |
551 p += bpp; \ | |
552 } \ | |
553 } \ | |
554 SSE_WORK; \ | |
555 } \ | |
556 if (n & 63) { \ | |
557 int remainder = (n & 63); \ | |
558 remainder /= bpp; \ | |
559 while(remainder--) { \ | |
560 *((type *)p) = (type)color; \ | |
561 p += bpp; \ | |
562 } \ | |
563 } \ | |
564 pixels += pitch; \ | |
565 } \ | |
566 \ | |
567 SSE_END; \ | |
568 } | |
569 | |
/* Instantiate SDL_FillRect1SSE, SDL_FillRect2SSE and SDL_FillRect4SSE. */
570 DEFINE_SSE_FILLRECT(1, Uint8) | |
571 DEFINE_SSE_FILLRECT(2, Uint16) | |
572 DEFINE_SSE_FILLRECT(4, Uint32) | |
573 | |
574 /* *INDENT-ON* */ | |
575 #endif /* __SSE__ */ | |
576 | |
577 #ifdef __MMX__ | |
578 /* *INDENT-OFF* */ | |
579 | |
/*
 * MMX_BEGIN: declares 'c64', a 64-bit value built from two copies of 'color'
 * (_mm_set_pi32(color, color)).
 */
580 #define MMX_BEGIN \ | |
581 __m64 c64 = _mm_set_pi32(color, color) | |
582 | |
/*
 * MMX_WORK: runs n / 64 iterations; each one issues eight _mm_stream_pi
 * stores of 'c64' at p+0 .. p+56 and then advances p by 64 bytes.
 * Relies on 'i', 'n', 'p' and 'c64' being in scope where it is expanded.
 */
583 #define MMX_WORK \ | |
584 for (i = n / 64; i--;) { \ | |
585 _mm_stream_pi((__m64 *)(p+0), c64); \ | |
586 _mm_stream_pi((__m64 *)(p+8), c64); \ | |
587 _mm_stream_pi((__m64 *)(p+16), c64); \ | |
588 _mm_stream_pi((__m64 *)(p+24), c64); \ | |
589 _mm_stream_pi((__m64 *)(p+32), c64); \ | |
590 _mm_stream_pi((__m64 *)(p+40), c64); \ | |
591 _mm_stream_pi((__m64 *)(p+48), c64); \ | |
592 _mm_stream_pi((__m64 *)(p+56), c64); \ | |
593 p += 64; \ | |
594 } | |
595 | |
/* MMX_END: calls _mm_empty() once all rows have been written. */
596 #define MMX_END \ | |
597 _mm_empty() | |
598 | |
/*
 * DEFINE_MMX_FILLRECT(bpp, type) expands to
 *     static void SDL_FillRect<bpp>MMX(Uint8 *pixels, int pitch,
 *                                      Uint32 color, int w, int h)
 * which fills h rows of w pixels, bpp bytes per pixel, starting at 'pixels'
 * and stepping 'pitch' bytes from one row to the next.
 *
 * For every row (n = w * bpp bytes):
 *   - if n > 7, single 'type' stores are issued until p sits on an 8-byte
 *     boundary (skipped when p is already aligned, i.e. adjust == 8), then
 *     MMX_WORK writes the row in 64-byte chunks;
 *   - the (n & 63) / bpp pixels that are left are written one 'type' at a time.
 *
 * NOTE(review): 'adjust /= bpp' truncates, so the lead-in only reaches the
 * 8-byte boundary when the row start is a multiple of bpp -- presumably
 * guaranteed by the caller; confirm.
 */
599 #define DEFINE_MMX_FILLRECT(bpp, type) \ | |
600 static void \ | |
601 SDL_FillRect##bpp##MMX(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \ | |
602 { \ | |
603 MMX_BEGIN; \ | |
604 \ | |
605 while (h--) { \ | |
606 int i, n = w * bpp; \ | |
607 Uint8 *p = pixels; \ | |
608 \ | |
609 if (n > 7) { \ | |
610 int adjust = 8 - ((uintptr_t)p & 7); \ | |
611 if (adjust < 8) { \ | |
612 n -= adjust; \ | |
613 adjust /= bpp; \ | |
614 while(adjust--) { \ | |
615 *((type *)p) = (type)color; \ | |
616 p += bpp; \ | |
617 } \ | |
618 } \ | |
619 MMX_WORK; \ | |
620 } \ | |
621 if (n & 63) { \ | |
622 int remainder = (n & 63); \ | |
623 remainder /= bpp; \ | |
624 while(remainder--) { \ | |
625 *((type *)p) = (type)color; \ | |
626 p += bpp; \ | |
627 } \ | |
628 } \ | |
629 pixels += pitch; \ | |
630 } \ | |
631 \ | |
632 MMX_END; \ | |
633 } | |
634 | |
/* Instantiate SDL_FillRect1MMX, SDL_FillRect2MMX and SDL_FillRect4MMX. */
635 DEFINE_MMX_FILLRECT(1, Uint8) | |
636 DEFINE_MMX_FILLRECT(2, Uint16) | |
637 DEFINE_MMX_FILLRECT(4, Uint32) | |
638 | |
639 /* *INDENT-ON* */ | |
640 #endif /* __MMX__ */ | |
641 | |
642 static void | |
643 SDL_FillRect1(Uint8 * pixels, int pitch, Uint32 color, int w, int h) | |
644 { | |
645 while (h--) { | |
646 int n = w; | |
647 Uint8 *p = pixels; | |
648 | |
649 if (n > 3) { | |
650 switch ((uintptr_t) p & 3) { | |
651 case 1: | |
652 *p++ = (Uint8) color; | |
653 --n; | |
654 case 2: | |
655 *p++ = (Uint8) color; | |
656 --n; | |
657 case 3: | |
658 *p++ = (Uint8) color; | |
659 --n; | |
660 } | |
661 SDL_memset4(p, color, (n >> 2)); | |
662 } | |
663 if (n & 3) { | |
664 p += (n & ~3); | |
665 switch (n & 3) { | |
666 case 3: | |
667 *p++ = (Uint8) color; | |
668 case 2: | |
669 *p++ = (Uint8) color; | |
670 case 1: | |
671 *p++ = (Uint8) color; | |
672 } | |
673 } | |
674 pixels += pitch; | |
675 } | |
676 } | |
677 | |
678 static void | |
679 SDL_FillRect2(Uint8 * pixels, int pitch, Uint32 color, int w, int h) | |
680 { | |
681 while (h--) { | |
682 int n = w; | |
683 Uint16 *p = (Uint16 *) pixels; | |
684 | |
685 if (n > 1) { | |
686 if ((uintptr_t) p & 2) { | |
687 *p++ = (Uint16) color; | |
688 --n; | |
689 } | |
690 SDL_memset4(p, color, (n >> 1)); | |
691 } | |
692 if (n & 1) { | |
693 p[n - 1] = (Uint16) color; | |
694 } | |
695 pixels += pitch; | |
696 } | |
697 } | |
698 | |
699 static void | |
700 SDL_FillRect3(Uint8 * pixels, int pitch, Uint32 color, int w, int h) | |
701 { | |
702 Uint8 r = (Uint8) (color & 0xFF); | |
703 Uint8 g = (Uint8) ((color >> 8) & 0xFF); | |
704 Uint8 b = (Uint8) ((color >> 16) & 0xFF); | |
705 | |
706 while (h--) { | |
707 int n = w; | |
708 Uint8 *p = pixels; | |
709 | |
710 while (n--) { | |
711 *p++ = r; | |
712 *p++ = g; | |
713 *p++ = b; | |
714 } | |
715 pixels += pitch; | |
716 } | |
717 } | |
718 | |
719 static void | |
720 SDL_FillRect4(Uint8 * pixels, int pitch, Uint32 color, int w, int h) | |
721 { | |
722 while (h--) { | |
723 SDL_memset4(pixels, color, w); | |
724 pixels += pitch; | |
725 } | |
526 } | 726 } |
527 | 727 |
528 /* | 728 /* |
529 * This function performs a fast fill of the given rectangle with 'color' | 729 * This function performs a fast fill of the given rectangle with 'color' |
530 */ | 730 */ |
531 int | 731 int |
532 SDL_FillRect(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color) | 732 SDL_FillRect(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color) |
533 { | 733 { |
534 int x, y; | 734 Uint8 *pixels; |
535 Uint8 *row; | |
536 | 735 |
537 /* This function doesn't work on surfaces < 8 bpp */ | 736 /* This function doesn't work on surfaces < 8 bpp */ |
538 if (dst->format->BitsPerPixel < 8) { | 737 if (dst->format->BitsPerPixel < 8) { |
539 switch (dst->format->BitsPerPixel) { | 738 SDL_SetError("Fill rect on unsupported surface format"); |
540 case 1: | 739 return (-1); |
541 return SDL_FillRect1(dst, dstrect, color); | |
542 break; | |
543 case 4: | |
544 return SDL_FillRect4(dst, dstrect, color); | |
545 break; | |
546 default: | |
547 SDL_SetError("Fill rect on unsupported surface format"); | |
548 return (-1); | |
549 break; | |
550 } | |
551 } | 740 } |
552 | 741 |
553 /* If 'dstrect' == NULL, then fill the whole surface */ | 742 /* If 'dstrect' == NULL, then fill the whole surface */ |
554 if (dstrect) { | 743 if (dstrect) { |
555 /* Perform clipping */ | 744 /* Perform clipping */ |
562 | 751 |
563 /* Perform software fill */ | 752 /* Perform software fill */ |
564 if (SDL_LockSurface(dst) != 0) { | 753 if (SDL_LockSurface(dst) != 0) { |
565 return (-1); | 754 return (-1); |
566 } | 755 } |
567 row = (Uint8 *) dst->pixels + dstrect->y * dst->pitch + | 756 |
757 pixels = | |
758 (Uint8 *) dst->pixels + dstrect->y * dst->pitch + | |
568 dstrect->x * dst->format->BytesPerPixel; | 759 dstrect->x * dst->format->BytesPerPixel; |
569 if (dst->format->palette || (color == 0)) { | 760 |
570 x = dstrect->w * dst->format->BytesPerPixel; | 761 switch (dst->format->BytesPerPixel) { |
571 #ifndef __MACOSX__ /* memset() is optimized on Mac OS X */ | 762 case 1: |
572 if (!color && !((uintptr_t) row & 3) && !(x & 3) | |
573 && !(dst->pitch & 3)) { | |
574 int n = x >> 2; | |
575 for (y = dstrect->h; y; --y) { | |
576 SDL_memset4(row, 0, n); | |
577 row += dst->pitch; | |
578 } | |
579 } else | |
580 #endif /* !__MACOSX__ */ | |
581 { | 763 { |
582 for (y = dstrect->h; y; y--) { | 764 color |= (color << 8); |
583 SDL_memset(row, color, x); | 765 color |= (color << 16); |
584 row += dst->pitch; | 766 #ifdef __SSE__ |
585 } | 767 if (SDL_HasSSE()) { |
586 } | 768 SDL_FillRect1SSE(pixels, dst->pitch, color, dstrect->w, |
587 } else { | 769 dstrect->h); |
588 switch (dst->format->BytesPerPixel) { | 770 break; |
589 case 2: | 771 } |
590 { | 772 #endif |
591 Uint16 c = (Uint16) color; | 773 #ifdef __MMX__ |
592 Uint32 cc = (Uint32) c << 16 | c; | 774 if (SDL_HasMMX()) { |
593 for (y = dstrect->h; y; --y) { | 775 SDL_FillRect1MMX(pixels, dst->pitch, color, dstrect->w, |
594 Uint16 *pixels = (Uint16 *) row; | 776 dstrect->h); |
595 int n = dstrect->w; | 777 break; |
596 if ((uintptr_t) pixels & 3) { | 778 } |
597 *pixels++ = c; | 779 #endif |
598 n--; | 780 SDL_FillRect1(pixels, dst->pitch, color, dstrect->w, dstrect->h); |
599 } | |
600 if (n >> 1) | |
601 SDL_memset4(pixels, cc, n >> 1); | |
602 if (n & 1) | |
603 pixels[n - 1] = c; | |
604 row += dst->pitch; | |
605 } | |
606 } | |
607 break; | 781 break; |
608 | 782 } |
609 case 3: | 783 |
610 #if SDL_BYTEORDER == SDL_BIG_ENDIAN | 784 case 2: |
611 color <<= 8; | 785 { |
786 color |= (color << 16); | |
787 #ifdef __SSE__ | |
788 if (SDL_HasSSE()) { | |
789 SDL_FillRect2SSE(pixels, dst->pitch, color, dstrect->w, | |
790 dstrect->h); | |
791 break; | |
792 } | |
612 #endif | 793 #endif |
613 for (y = dstrect->h; y; --y) { | 794 #ifdef __MMX__ |
614 Uint8 *pixels = row; | 795 if (SDL_HasMMX()) { |
615 for (x = dstrect->w; x; --x) { | 796 SDL_FillRect2MMX(pixels, dst->pitch, color, dstrect->w, |
616 SDL_memcpy(pixels, &color, 3); | 797 dstrect->h); |
617 pixels += 3; | 798 break; |
618 } | 799 } |
619 row += dst->pitch; | 800 #endif |
620 } | 801 SDL_FillRect2(pixels, dst->pitch, color, dstrect->w, dstrect->h); |
621 break; | 802 break; |
622 | 803 } |
623 case 4: | 804 |
624 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES | 805 case 3: |
625 if (SDL_HasSSE() && !((uintptr_t) row & 15) && !(dstrect->w & 3)) { | 806 /* 24-bit RGB is a slow path, at least for now. */ |
626 Uint32 cccc[4] __attribute__ ((aligned(16))) = { | 807 { |
627 color, color, color, color}; | 808 SDL_FillRect3(pixels, dst->pitch, color, dstrect->w, dstrect->h); |
628 int i, n = dstrect->w / 4; | 809 break; |
629 __asm__ __volatile__(" movdqa (%0), %%xmm0\n":: | 810 } |
630 "r"(cccc):"memory"); | 811 |
631 for (y = dstrect->h; y; --y) { | 812 case 4: |
632 Uint8 *pixels = row; | 813 { |
633 for (i = n / 2; i--;) { | 814 #ifdef __SSE__ |
634 /* *INDENT-OFF* */ | 815 if (SDL_HasSSE()) { |
635 __asm__ __volatile__(" prefetchnta 256(%0)\n" | 816 SDL_FillRect4SSE(pixels, dst->pitch, color, dstrect->w, |
636 " movdqa %%xmm0, (%0)\n" | 817 dstrect->h); |
637 " movdqa %%xmm0, 16(%0)\n"::"r"(pixels):"memory"); | |
638 /* *INDENT-ON* */ | |
639 pixels += 32; | |
640 } | |
641 if (n & 1) { | |
642 __asm__ __volatile__(" movdqa %%xmm0, (%0)\n":: | |
643 "r"(pixels):"memory"); | |
644 } | |
645 row += dst->pitch; | |
646 } | |
647 __asm__ __volatile__(" emms\n"::); | |
648 break; | 818 break; |
649 } | 819 } |
650 #endif | 820 #endif |
651 for (y = dstrect->h; y; --y) { | 821 #ifdef __MMX__ |
652 SDL_memset4(row, color, dstrect->w); | 822 if (SDL_HasMMX()) { |
653 row += dst->pitch; | 823 SDL_FillRect4MMX(pixels, dst->pitch, color, dstrect->w, |
654 } | 824 dstrect->h); |
825 break; | |
826 } | |
827 #endif | |
828 SDL_FillRect4(pixels, dst->pitch, color, dstrect->w, dstrect->h); | |
655 break; | 829 break; |
656 } | 830 } |
657 } | 831 } |
832 | |
658 SDL_UnlockSurface(dst); | 833 SDL_UnlockSurface(dst); |
659 | 834 |
660 /* We're done! */ | 835 /* We're done! */ |
661 return (0); | 836 return (0); |
662 } | 837 } |