# HG changeset patch # User Sam Lantinga # Date 1231831255 0 # Node ID ff602fdfdedc670e3b90d7f350d2282372203a0b # Parent 0e821769fc5146cb2e27ae0b2e149f913c0bb197 Removed Rafal Bursig's MMX RLE code, at his request. diff -r 0e821769fc51 -r ff602fdfdedc src/video/SDL_RLEaccel.c --- a/src/video/SDL_RLEaccel.c Tue Jan 13 03:53:22 2009 +0000 +++ b/src/video/SDL_RLEaccel.c Tue Jan 13 07:20:55 2009 +0000 @@ -91,15 +91,6 @@ #include "SDL_blit.h" #include "SDL_RLEaccel_c.h" -#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES -#define MMX_ASMBLIT -#endif - -#ifdef MMX_ASMBLIT -#include "mmx.h" -#include "SDL_cpuinfo.h" -#endif - #ifndef MAX #define MAX(a, b) ((a) > (b) ? (a) : (b)) #endif @@ -123,262 +114,6 @@ #define OPAQUE_BLIT(to, from, length, bpp, alpha) \ PIXEL_COPY(to, from, length, bpp) -#ifdef MMX_ASMBLIT - -#define ALPHA_BLIT32_888MMX(to, from, length, bpp, alpha) \ - do { \ - Uint32 *srcp = (Uint32 *)(from); \ - Uint32 *dstp = (Uint32 *)(to); \ - int i = 0x00FF00FF; \ - movd_m2r(*(&i), mm3); \ - punpckldq_r2r(mm3, mm3); \ - i = 0xFF000000; \ - movd_m2r(*(&i), mm7); \ - punpckldq_r2r(mm7, mm7); \ - i = alpha | alpha << 16; \ - movd_m2r(*(&i), mm4); \ - punpckldq_r2r(mm4, mm4); \ - pcmpeqd_r2r(mm5,mm5); /* set mm5 to "1" */ \ - pxor_r2r(mm7, mm5); /* make clear alpha mask */ \ - i = length; \ - if(i & 1) { \ - movd_m2r((*srcp), mm1); /* src -> mm1 */ \ - punpcklbw_r2r(mm1, mm1); \ - pand_r2r(mm3, mm1); \ - movd_m2r((*dstp), mm2); /* dst -> mm2 */ \ - punpcklbw_r2r(mm2, mm2); \ - pand_r2r(mm3, mm2); \ - psubw_r2r(mm2, mm1); \ - pmullw_r2r(mm4, mm1); \ - psrlw_i2r(8, mm1); \ - paddw_r2r(mm1, mm2); \ - pand_r2r(mm3, mm2); \ - packuswb_r2r(mm2, mm2); \ - pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */ \ - movd_r2m(mm2, *dstp); \ - ++srcp; \ - ++dstp; \ - i--; \ - } \ - for(; i > 0; --i) { \ - movq_m2r((*srcp), mm0); \ - movq_r2r(mm0, mm1); \ - punpcklbw_r2r(mm0, mm0); \ - movq_m2r((*dstp), mm2); \ - punpckhbw_r2r(mm1, mm1); \ - movq_r2r(mm2, mm6); \ - pand_r2r(mm3, mm0); \ - punpcklbw_r2r(mm2, mm2); \ - pand_r2r(mm3, mm1); \ - punpckhbw_r2r(mm6, mm6); \ - pand_r2r(mm3, mm2); \ - psubw_r2r(mm2, mm0); \ - pmullw_r2r(mm4, mm0); \ - pand_r2r(mm3, mm6); \ - psubw_r2r(mm6, mm1); \ - pmullw_r2r(mm4, mm1); \ - psrlw_i2r(8, mm0); \ - paddw_r2r(mm0, mm2); \ - psrlw_i2r(8, mm1); \ - paddw_r2r(mm1, mm6); \ - pand_r2r(mm3, mm2); \ - pand_r2r(mm3, mm6); \ - packuswb_r2r(mm2, mm2); \ - packuswb_r2r(mm6, mm6); \ - psrlq_i2r(32, mm2); \ - psllq_i2r(32, mm6); \ - por_r2r(mm6, mm2); \ - pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */ \ - movq_r2m(mm2, *dstp); \ - srcp += 2; \ - dstp += 2; \ - i--; \ - } \ - emms(); \ - } while(0) - -#define ALPHA_BLIT16_565MMX(to, from, length, bpp, alpha) \ - do { \ - int i, n = 0; \ - Uint16 *srcp = (Uint16 *)(from); \ - Uint16 *dstp = (Uint16 *)(to); \ - Uint32 ALPHA = 0xF800; \ - movd_m2r(*(&ALPHA), mm1); \ - punpcklwd_r2r(mm1, mm1); \ - punpcklwd_r2r(mm1, mm1); \ - ALPHA = 0x07E0; \ - movd_m2r(*(&ALPHA), mm4); \ - punpcklwd_r2r(mm4, mm4); \ - punpcklwd_r2r(mm4, mm4); \ - ALPHA = 0x001F; \ - movd_m2r(*(&ALPHA), mm7); \ - punpcklwd_r2r(mm7, mm7); \ - punpcklwd_r2r(mm7, mm7); \ - alpha &= ~(1+2+4); \ - i = (Uint32)alpha | (Uint32)alpha << 16; \ - movd_m2r(*(&i), mm0); \ - punpckldq_r2r(mm0, mm0); \ - ALPHA = alpha >> 3; \ - i = ((int)(length) & 3); \ - for(; i > 0; --i) { \ - Uint32 s = *srcp++; \ - Uint32 d = *dstp; \ - s = (s | s << 16) & 0x07e0f81f; \ - d = (d | d << 16) & 0x07e0f81f; \ - d += (s - d) * ALPHA >> 5; \ - d &= 0x07e0f81f; \ - *dstp++ = d | d >> 16; \ - n++; \ - } \ - i = (int)(length) - n; \ - for(; i > 0; --i) { \ - movq_m2r((*dstp), mm3); \ - movq_m2r((*srcp), mm2); \ - movq_r2r(mm2, mm5); \ - pand_r2r(mm1 , mm5); \ - psrlq_i2r(11, mm5); \ - movq_r2r(mm3, mm6); \ - pand_r2r(mm1 , mm6); \ - psrlq_i2r(11, mm6); \ - psubw_r2r(mm6, mm5); \ - pmullw_r2r(mm0, mm5); \ - psrlw_i2r(8, mm5); \ - paddw_r2r(mm5, mm6); \ - psllq_i2r(11, mm6); \ - pand_r2r(mm1, mm6); \ - movq_r2r(mm4, mm5); \ - por_r2r(mm7, mm5); \ - pand_r2r(mm5, mm3); \ - por_r2r(mm6, mm3); \ - movq_r2r(mm2, mm5); \ - pand_r2r(mm4 , mm5); \ - psrlq_i2r(5, mm5); \ - movq_r2r(mm3, mm6); \ - pand_r2r(mm4 , mm6); \ - psrlq_i2r(5, mm6); \ - psubw_r2r(mm6, mm5); \ - pmullw_r2r(mm0, mm5); \ - psrlw_i2r(8, mm5); \ - paddw_r2r(mm5, mm6); \ - psllq_i2r(5, mm6); \ - pand_r2r(mm4, mm6); \ - movq_r2r(mm1, mm5); \ - por_r2r(mm7, mm5); \ - pand_r2r(mm5, mm3); \ - por_r2r(mm6, mm3); \ - movq_r2r(mm2, mm5); \ - pand_r2r(mm7 , mm5); \ - movq_r2r(mm3, mm6); \ - pand_r2r(mm7 , mm6); \ - psubw_r2r(mm6, mm5); \ - pmullw_r2r(mm0, mm5); \ - psrlw_i2r(8, mm5); \ - paddw_r2r(mm5, mm6); \ - pand_r2r(mm7, mm6); \ - movq_r2r(mm1, mm5); \ - por_r2r(mm4, mm5); \ - pand_r2r(mm5, mm3); \ - por_r2r(mm6, mm3); \ - movq_r2m(mm3, *dstp); \ - srcp += 4; \ - dstp += 4; \ - i -= 3; \ - } \ - emms(); \ - } while(0) - -#define ALPHA_BLIT16_555MMX(to, from, length, bpp, alpha) \ - do { \ - int i, n = 0; \ - Uint16 *srcp = (Uint16 *)(from); \ - Uint16 *dstp = (Uint16 *)(to); \ - Uint32 ALPHA = 0x7C00; \ - movd_m2r(*(&ALPHA), mm1); \ - punpcklwd_r2r(mm1, mm1); \ - punpcklwd_r2r(mm1, mm1); \ - ALPHA = 0x03E0; \ - movd_m2r(*(&ALPHA), mm4); \ - punpcklwd_r2r(mm4, mm4); \ - punpcklwd_r2r(mm4, mm4); \ - ALPHA = 0x001F; \ - movd_m2r(*(&ALPHA), mm7); \ - punpcklwd_r2r(mm7, mm7); \ - punpcklwd_r2r(mm7, mm7); \ - alpha &= ~(1+2+4); \ - i = (Uint32)alpha | (Uint32)alpha << 16; \ - movd_m2r(*(&i), mm0); \ - punpckldq_r2r(mm0, mm0); \ - i = ((int)(length) & 3); \ - ALPHA = alpha >> 3; \ - for(; i > 0; --i) { \ - Uint32 s = *srcp++; \ - Uint32 d = *dstp; \ - s = (s | s << 16) & 0x03e07c1f; \ - d = (d | d << 16) & 0x03e07c1f; \ - d += (s - d) * ALPHA >> 5; \ - d &= 0x03e07c1f; \ - *dstp++ = d | d >> 16; \ - n++; \ - } \ - i = (int)(length) - n; \ - for(; i > 0; --i) { \ - movq_m2r((*dstp), mm3); \ - movq_m2r((*srcp), mm2); \ - movq_r2r(mm2, mm5); \ - pand_r2r(mm1 , mm5); \ - psrlq_i2r(10, mm5); \ - movq_r2r(mm3, mm6); \ - pand_r2r(mm1 , mm6); \ - psrlq_i2r(10, mm6); \ - psubw_r2r(mm6, mm5); \ - pmullw_r2r(mm0, mm5); \ - psrlw_i2r(8, mm5); \ - paddw_r2r(mm5, mm6); \ - psllq_i2r(10, mm6); \ - pand_r2r(mm1, mm6); \ - movq_r2r(mm4, mm5); \ - por_r2r(mm7, mm5); \ - pand_r2r(mm5, mm3); \ - por_r2r(mm6, mm3); \ - movq_r2r(mm2, mm5); \ - pand_r2r(mm4 , mm5); \ - psrlq_i2r(5, mm5); \ - movq_r2r(mm3, mm6); \ - pand_r2r(mm4 , mm6); \ - psrlq_i2r(5, mm6); \ - psubw_r2r(mm6, mm5); \ - pmullw_r2r(mm0, mm5); \ - psrlw_i2r(8, mm5); \ - paddw_r2r(mm5, mm6); \ - psllq_i2r(5, mm6); \ - pand_r2r(mm4, mm6); \ - movq_r2r(mm1, mm5); \ - por_r2r(mm7, mm5); \ - pand_r2r(mm5, mm3); \ - por_r2r(mm6, mm3); \ - movq_r2r(mm2, mm5); \ - pand_r2r(mm7 , mm5); \ - movq_r2r(mm3, mm6); \ - pand_r2r(mm7 , mm6); \ - psubw_r2r(mm6, mm5); \ - pmullw_r2r(mm0, mm5); \ - psrlw_i2r(8, mm5); \ - paddw_r2r(mm5, mm6); \ - pand_r2r(mm7, mm6); \ - movq_r2r(mm1, mm5); \ - por_r2r(mm4, mm5); \ - pand_r2r(mm5, mm3); \ - por_r2r(mm6, mm3); \ - movq_r2m(mm3, *dstp); \ - srcp += 4; \ - dstp += 4; \ - i -= 3; \ - } \ - emms(); \ - } while(0) - -#endif - /* * For 32bpp pixels on the form 0x00rrggbb: * If we treat the middle component separately, we can process the two @@ -504,48 +239,6 @@ } \ } while(0) -#ifdef MMX_ASMBLIT - -#define ALPHA_BLIT32_888_50MMX(to, from, length, bpp, alpha) \ - do { \ - Uint32 *srcp = (Uint32 *)(from); \ - Uint32 *dstp = (Uint32 *)(to); \ - int i = 0x00fefefe; \ - movd_m2r(*(&i), mm4); \ - punpckldq_r2r(mm4, mm4); \ - i = 0x00010101; \ - movd_m2r(*(&i), mm3); \ - punpckldq_r2r(mm3, mm3); \ - i = (int)(length); \ - if( i & 1 ) { \ - Uint32 s = *srcp++; \ - Uint32 d = *dstp; \ - *dstp++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) \ - + (s & d & 0x00010101); \ - i--; \ - } \ - for(; i > 0; --i) { \ - movq_m2r((*dstp), mm2); /* dst -> mm2 */ \ - movq_r2r(mm2, mm6); /* dst -> mm6 */ \ - movq_m2r((*srcp), mm1); /* src -> mm1 */ \ - movq_r2r(mm1, mm5); /* src -> mm5 */ \ - pand_r2r(mm4, mm6); /* dst & 0x00fefefe -> mm6 */ \ - pand_r2r(mm4, mm5); /* src & 0x00fefefe -> mm5 */ \ - paddd_r2r(mm6, mm5); /* (dst & 0x00fefefe) + (dst & 0x00fefefe) -> mm5 */ \ - psrld_i2r(1, mm5); \ - pand_r2r(mm1, mm2); /* s & d -> mm2 */ \ - pand_r2r(mm3, mm2); /* s & d & 0x00010101 -> mm2 */ \ - paddd_r2r(mm5, mm2); \ - movq_r2m(mm2, (*dstp)); \ - dstp += 2; \ - srcp += 2; \ - i--; \ - } \ - emms(); \ - } while(0) - -#endif - /* * Special case: 50% alpha (alpha=128) * This is treated specially because it can be optimized very well, and @@ -617,94 +310,6 @@ #define ALPHA_BLIT16_555_50(to, from, length, bpp, alpha) \ ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xfbde) -#ifdef MMX_ASMBLIT - -#define CHOOSE_BLIT(blitter, alpha, fmt) \ - do { \ - if(alpha == 255) { \ - switch(fmt->BytesPerPixel) { \ - case 1: blitter(1, Uint8, OPAQUE_BLIT); break; \ - case 2: blitter(2, Uint8, OPAQUE_BLIT); break; \ - case 3: blitter(3, Uint8, OPAQUE_BLIT); break; \ - case 4: blitter(4, Uint16, OPAQUE_BLIT); break; \ - } \ - } else { \ - switch(fmt->BytesPerPixel) { \ - case 1: \ - /* No 8bpp alpha blitting */ \ - break; \ - \ - case 2: \ - switch(fmt->Rmask | fmt->Gmask | fmt->Bmask) { \ - case 0xffff: \ - if(fmt->Gmask == 0x07e0 \ - || fmt->Rmask == 0x07e0 \ - || fmt->Bmask == 0x07e0) { \ - if(alpha == 128) \ - blitter(2, Uint8, ALPHA_BLIT16_565_50); \ - else { \ - if(SDL_HasMMX()) \ - blitter(2, Uint8, ALPHA_BLIT16_565MMX); \ - else \ - blitter(2, Uint8, ALPHA_BLIT16_565); \ - } \ - } else \ - goto general16; \ - break; \ - \ - case 0x7fff: \ - if(fmt->Gmask == 0x03e0 \ - || fmt->Rmask == 0x03e0 \ - || fmt->Bmask == 0x03e0) { \ - if(alpha == 128) \ - blitter(2, Uint8, ALPHA_BLIT16_555_50); \ - else { \ - if(SDL_HasMMX()) \ - blitter(2, Uint8, ALPHA_BLIT16_555MMX); \ - else \ - blitter(2, Uint8, ALPHA_BLIT16_555); \ - } \ - break; \ - } \ - /* fallthrough */ \ - \ - default: \ - general16: \ - blitter(2, Uint8, ALPHA_BLIT_ANY); \ - } \ - break; \ - \ - case 3: \ - blitter(3, Uint8, ALPHA_BLIT_ANY); \ - break; \ - \ - case 4: \ - if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff \ - && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00 \ - || fmt->Bmask == 0xff00)) { \ - if(alpha == 128) \ - { \ - if(SDL_HasMMX()) \ - blitter(4, Uint16, ALPHA_BLIT32_888_50MMX);\ - else \ - blitter(4, Uint16, ALPHA_BLIT32_888_50);\ - } \ - else \ - { \ - if(SDL_HasMMX()) \ - blitter(4, Uint16, ALPHA_BLIT32_888MMX);\ - else \ - blitter(4, Uint16, ALPHA_BLIT32_888); \ - } \ - } else \ - blitter(4, Uint16, ALPHA_BLIT_ANY); \ - break; \ - } \ - } \ - } while(0) - -#else - #define CHOOSE_BLIT(blitter, alpha, fmt) \ do { \ if(alpha == 255) { \ @@ -773,8 +378,6 @@ } \ } while(0) -#endif - /* * This takes care of the case when the surface is clipped on the left and/or * right. Top clipping has already been taken care of. diff -r 0e821769fc51 -r ff602fdfdedc src/video/SDL_blit.h --- a/src/video/SDL_blit.h Tue Jan 13 03:53:22 2009 +0000 +++ b/src/video/SDL_blit.h Tue Jan 13 07:20:55 2009 +0000 @@ -476,48 +476,7 @@ case 3: pixel_copy_increment; \ case 2: pixel_copy_increment; \ case 1: pixel_copy_increment; \ - } while ( --n > 0 ); \ - } \ -} - -/* 2 - times unrolled loop */ -#define DUFFS_LOOP_DOUBLE2(pixel_copy_increment, \ - double_pixel_copy_increment, width) \ -{ int n, w = width; \ - if( w & 1 ) { \ - pixel_copy_increment; \ - w--; \ - } \ - if ( w > 0 ) { \ - n = ( w + 2) / 4; \ - switch( w & 2 ) { \ - case 0: do { double_pixel_copy_increment; \ - case 2: double_pixel_copy_increment; \ - } while ( --n > 0 ); \ - } \ - } \ -} - -/* 2 - times unrolled loop 4 pixels */ -#define DUFFS_LOOP_QUATRO2(pixel_copy_increment, \ - double_pixel_copy_increment, \ - quatro_pixel_copy_increment, width) \ -{ int n, w = width; \ - if(w & 1) { \ - pixel_copy_increment; \ - w--; \ - } \ - if(w & 2) { \ - double_pixel_copy_increment; \ - w -= 2; \ - } \ - if ( w > 0 ) { \ - n = ( w + 7 ) / 8; \ - switch( w & 4 ) { \ - case 0: do { quatro_pixel_copy_increment; \ - case 4: quatro_pixel_copy_increment; \ - } while ( --n > 0 ); \ - } \ + } while (--n > 0); \ } \ } @@ -525,40 +484,28 @@ #define DUFFS_LOOP(pixel_copy_increment, width) \ DUFFS_LOOP8(pixel_copy_increment, width) -#else - -/* Don't use Duff's device to unroll loops */ -#define DUFFS_LOOP_DOUBLE2(pixel_copy_increment, \ - double_pixel_copy_increment, width) \ -{ int n = width; \ - if( n & 1 ) { \ - pixel_copy_increment; \ - n--; \ - } \ - n=n>>1; \ - for(; n > 0; --n) { \ - double_pixel_copy_increment; \ - } \ +/* Special version of Duff's device for even more optimization */ +#define DUFFS_LOOP_124(pixel_copy_increment1, \ + pixel_copy_increment2, \ + pixel_copy_increment4, width) \ +{ int n = width; \ + if (n & 1) { \ + pixel_copy_increment1; n -= 1; \ + } \ + if (n & 2) { \ + pixel_copy_increment2; n -= 2; \ + } \ + if (n) { \ + n = (n+7)/ 8; \ + switch (n & 4) { \ + case 0: do { pixel_copy_increment4; \ + case 4: pixel_copy_increment4; \ + } while (--n > 0); \ + } \ + } \ } -/* Don't use Duff's device to unroll loops */ -#define DUFFS_LOOP_QUATRO2(pixel_copy_increment, \ - double_pixel_copy_increment, \ - quatro_pixel_copy_increment, width) \ -{ int n = width; \ - if(n & 1) { \ - pixel_copy_increment; \ - n--; \ - } \ - if(n & 2) { \ - double_pixel_copy_increment; \ - n -= 2; \ - } \ - n=n>>2; \ - for(; n > 0; --n) { \ - quatro_pixel_copy_increment; \ - } \ -} +#else /* Don't use Duff's device to unroll loops */ #define DUFFS_LOOP(pixel_copy_increment, width) \ @@ -571,6 +518,10 @@ DUFFS_LOOP(pixel_copy_increment, width) #define DUFFS_LOOP4(pixel_copy_increment, width) \ DUFFS_LOOP(pixel_copy_increment, width) +#define DUFFS_LOOP_124(pixel_copy_increment1, \ + pixel_copy_increment2, \ + pixel_copy_increment4, width) \ + DUFFS_LOOP(pixel_copy_increment1, width) #endif /* USE_DUFFS_LOOP */ diff -r 0e821769fc51 -r ff602fdfdedc src/video/SDL_blit_A.c --- a/src/video/SDL_blit_A.c Tue Jan 13 03:53:22 2009 +0000 +++ b/src/video/SDL_blit_A.c Tue Jan 13 07:20:55 2009 +0000 @@ -1266,8 +1266,7 @@ while (height--) { /* *INDENT-OFF* */ - DUFFS_LOOP_DOUBLE2({ - /* One Pixel Blend */ + DUFFS_LOOP4({ s = *srcp; d = *dstp; s1 = s & 0xff00ff; @@ -1280,35 +1279,6 @@ *dstp = d1 | d | 0xff000000; ++srcp; ++dstp; - },{ - /* Two Pixels Blend */ - s = *srcp; - d = *dstp; - s1 = s & 0xff00ff; - d1 = d & 0xff00ff; - d1 += (s1 - d1) * alpha >> 8; - d1 &= 0xff00ff; - - s = ((s & 0xff00) >> 8) | - ((srcp[1] & 0xff00) << 8); - d = ((d & 0xff00) >> 8) | - ((dstp[1] & 0xff00) << 8); - d += (s - d) * alpha >> 8; - d &= 0x00ff00ff; - - *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000; - ++srcp; - - s1 = *srcp; - d1 = *dstp; - s1 &= 0xff00ff; - d1 &= 0xff00ff; - d1 += (s1 - d1) * alpha >> 8; - d1 &= 0xff00ff; - - *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000; - ++srcp; - ++dstp; }, width); /* *INDENT-ON* */ srcp += srcskip; @@ -1588,7 +1558,7 @@ while (height--) { /* *INDENT-OFF* */ - DUFFS_LOOP_QUATRO2( + DUFFS_LOOP_124( { s = *srcp++; d = *dstp; @@ -1726,7 +1696,7 @@ while (height--) { /* *INDENT-OFF* */ - DUFFS_LOOP_QUATRO2( + DUFFS_LOOP_124( { s = *srcp++; d = *dstp;