Mercurial > sdl-ios-xcode
diff src/video/SDL_blit_A.c @ 2255:17b2369756be
Use MMX intrinsics over GCC inline assembly
author | Sam Lantinga <slouken@libsdl.org> |
---|---|
date | Thu, 16 Aug 2007 22:18:53 +0000 |
parents | 6630fefab312 |
children | 340942cfda48 |
line wrap: on
line diff
--- a/src/video/SDL_blit_A.c Thu Aug 16 21:54:26 2007 +0000 +++ b/src/video/SDL_blit_A.c Thu Aug 16 22:18:53 2007 +0000 @@ -24,41 +24,6 @@ #include "SDL_video.h" #include "SDL_blit.h" -/* - In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on. - Checking if _mm_free is #defined in malloc.h is is the only way to - determine if the Processor Pack is installed, as far as I can tell. -*/ - -#if SDL_ASSEMBLY_ROUTINES -# if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) -# define MMX_ASMBLIT 1 -# define GCC_ASMBLIT 1 -# elif defined(_MSC_VER) && defined(_M_IX86) -# if (_MSC_VER <= 1200) -# include <malloc.h> -# if defined(_mm_free) -# define HAVE_MMINTRIN_H 1 -# endif -# else /* Visual Studio > VC6 always has mmintrin.h */ -# define HAVE_MMINTRIN_H 1 -# endif -# if HAVE_MMINTRIN_H -# define MMX_ASMBLIT 1 -# define MSVC_ASMBLIT 1 -# endif -# endif -#endif /* SDL_ASSEMBLY_ROUTINES */ - -/* Function to check the CPU flags */ -#include "SDL_cpuinfo.h" -#if GCC_ASMBLIT -#include "mmx.h" -#elif MSVC_ASMBLIT -#include <mmintrin.h> -#include <mm3dnow.h> -#endif - /* Functions to perform alpha blended blitting */ /* N->1 blending with per-surface alpha */ @@ -232,239 +197,8 @@ } } -#if GCC_ASMBLIT -/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ -static void -BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info) -{ - int width = info->d_width; - int height = info->d_height; - Uint32 *srcp = (Uint32 *) info->s_pixels; - int srcskip = info->s_skip >> 2; - Uint32 *dstp = (Uint32 *) info->d_pixels; - int dstskip = info->d_skip >> 2; - Uint32 dalpha = info->dst->Amask; - Uint8 load[8]; - - *(Uint64 *) load = 0x00fefefe00fefefeULL; /* alpha128 mask */ - movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */ - *(Uint64 *) load = 0x0001010100010101ULL; /* !alpha128 mask */ - movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */ - movd_m2r(dalpha, mm7); /* dst alpha mask */ - punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */ - while (height--) { - /* *INDENT-OFF* */ - DUFFS_LOOP_DOUBLE2( - { - Uint32 s = *srcp++; - Uint32 d = *dstp; - *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) - + (s & d & 0x00010101)) | dalpha; - },{ - movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ - movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ - - movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */ - movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */ - - pand_r2r(mm4, mm6); /* dst & mask -> mm6 */ - pand_r2r(mm4, mm5); /* src & mask -> mm5 */ - paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */ - pand_r2r(mm1, mm2); /* src & dst -> mm2 */ - psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */ - pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */ - paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */ - - por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ - movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */ - dstp += 2; - srcp += 2; - }, width); - /* *INDENT-ON* */ - srcp += srcskip; - dstp += dstskip; - } - emms(); -} - -/* fast RGB888->(A)RGB888 blending with surface alpha */ -static void -BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info) -{ - SDL_PixelFormat *df = info->dst; - unsigned alpha = info->src->alpha; - - if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) { - /* only call a128 version when R,G,B occupy lower bits */ - BlitRGBtoRGBSurfaceAlpha128MMX(info); - } else { - int width = info->d_width; - int height = info->d_height; - Uint32 *srcp = (Uint32 *) info->s_pixels; - int srcskip = info->s_skip >> 2; - Uint32 *dstp = (Uint32 *) info->d_pixels; - int dstskip = info->d_skip >> 2; - - pxor_r2r(mm5, mm5); /* 0 -> mm5 */ - /* form the alpha mult */ - movd_m2r(alpha, mm4); /* 0000000A -> mm4 */ - punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */ - punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */ - alpha = - (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df-> - Bshift); - movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */ - punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */ - pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */ - /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */ - movd_m2r(df->Amask, mm7); /* dst alpha mask */ - punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */ - - while (height--) { - /* *INDENT-OFF* */ - DUFFS_LOOP_DOUBLE2({ - /* One Pixel Blend */ - movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ - movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ - punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */ - punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */ - - psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ - pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ - psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ - paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ - - packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */ - por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ - movd_r2m(mm2, *dstp);/* mm2 -> pixel */ - ++srcp; - ++dstp; - },{ - /* Two Pixels Blend */ - movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/ - movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ - movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */ - movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ +#ifdef __MMX__ - punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */ - punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */ - punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */ - punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */ - - psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */ - pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */ - psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */ - paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */ - - psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */ - pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ - psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ - paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */ - - packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */ - por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */ - - movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */ - - srcp += 2; - dstp += 2; - }, width); - /* *INDENT-ON* */ - srcp += srcskip; - dstp += dstskip; - } - emms(); - } -} - -/* fast ARGB888->(A)RGB888 blending with pixel alpha */ -static void -BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info) -{ - int width = info->d_width; - int height = info->d_height; - Uint32 *srcp = (Uint32 *) info->s_pixels; - int srcskip = info->s_skip >> 2; - Uint32 *dstp = (Uint32 *) info->d_pixels; - int dstskip = info->d_skip >> 2; - SDL_PixelFormat *sf = info->src; - Uint32 amask = sf->Amask; - - pxor_r2r(mm6, mm6); /* 0 -> mm6 */ - /* form multiplication mask */ - movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */ - punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */ - pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */ - movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */ - pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */ - /* form channel masks */ - movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */ - packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */ - packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */ - pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */ - /* get alpha channel shift */ - /* *INDENT-OFF* */ - __asm__ __volatile__ ( - "movd %0, %%mm5" - : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */ - /* *INDENT-ON* */ - - while (height--) { - /* *INDENT-OFF* */ - DUFFS_LOOP4({ - Uint32 alpha = *srcp & amask; - /* FIXME: Here we special-case opaque alpha since the - compositioning used (>>8 instead of /255) doesn't handle - it correctly. Also special-case alpha=0 for speed? - Benchmark this! */ - if(alpha == 0) { - /* do nothing */ - } else if(alpha == amask) { - /* opaque alpha -- copy RGB, keep dst alpha */ - /* using MMX here to free up regular registers for other things */ - movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ - movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ - pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */ - pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */ - por_r2r(mm1, mm2); /* src | dst -> mm2 */ - movd_r2m(mm2, (*dstp)); /* mm2 -> dst */ - } else { - movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ - punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */ - - movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ - punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */ - - __asm__ __volatile__ ( - "movd %0, %%mm4" - : : "r" (alpha) ); /* 0000A000 -> mm4 */ - psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */ - punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */ - punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */ - pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */ - - /* blend */ - psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ - pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ - psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */ - paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ - - packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */ - movd_r2m(mm2, *dstp);/* mm2 -> dst */ - } - ++srcp; - ++dstp; - }, width); - /* *INDENT-ON* */ - srcp += srcskip; - dstp += dstskip; - } - emms(); -} - -/* End GCC_ASMBLIT */ - -#elif MSVC_ASMBLIT /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info) @@ -637,9 +371,9 @@ __m64 src1, dst1, mm_alpha, mm_zero, dmask; mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ - /* *INDENT-OFF* */ - multmask = ~(0xFFFFI64 << (ashift * 2)); - /* *INDENT-ON* */ + multmask = 0xFFFF; + multmask <<= (ashift * 2); + multmask = ~multmask; dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */ while (height--) { @@ -683,9 +417,7 @@ _mm_empty(); } -/* End MSVC_ASMBLIT */ - -#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ +#endif /* __MMX__ */ #if SDL_ALTIVEC_BLITTERS #if __MWERKS__ @@ -1639,123 +1371,7 @@ } } -#if GCC_ASMBLIT -/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ -static void -BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info) -{ - int width = info->d_width; - int height = info->d_height; - Uint32 *srcp = (Uint32 *) info->s_pixels; - int srcskip = info->s_skip >> 2; - Uint32 *dstp = (Uint32 *) info->d_pixels; - int dstskip = info->d_skip >> 2; - SDL_PixelFormat *sf = info->src; - Uint32 amask = sf->Amask; - - __asm__( - /* make mm6 all zeros. */ - "pxor %%mm6, %%mm6\n" - /* Make a mask to preserve the alpha. */ - "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */ - "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */ - "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */ - "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */ - "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */ - /* form channel masks */ - "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */ - "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */ - "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */ - "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */ - /* get alpha channel shift */ - "movd %1, %%mm5\n\t" /* Ashift -> mm5 */ - : /* nothing */ : "rm"(amask), "rm"((Uint32) sf->Ashift)); - - while (height--) { - - /* *INDENT-OFF* */ - DUFFS_LOOP4({ - Uint32 alpha; - - __asm__ ( - "prefetch 64(%0)\n" - "prefetch 64(%1)\n" - : : "r" (srcp), "r" (dstp) ); - - alpha = *srcp & amask; - /* FIXME: Here we special-case opaque alpha since the - compositioning used (>>8 instead of /255) doesn't handle - it correctly. Also special-case alpha=0 for speed? - Benchmark this! */ - if(alpha == 0) { - /* do nothing */ - } - else if(alpha == amask) { - /* opaque alpha -- copy RGB, keep dst alpha */ - /* using MMX here to free up regular registers for other things */ - __asm__ ( - "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/ - "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/ - "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */ - "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */ - "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */ - "movd %%mm1, (%1) \n\t" /* mm1 -> dst */ - - : : "r" (srcp), "r" (dstp) ); - } - - else { - __asm__ ( - /* load in the source, and dst. */ - "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */ - "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */ - - /* Move the src alpha into mm2 */ - - /* if supporting pshufw */ - /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */ - /*"psrlw $8, %%mm2\n" */ - - /* else: */ - "movd %2, %%mm2\n" - "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */ - "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */ - "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */ - "pand %%mm7, %%mm2\n" /* to preserve dest alpha */ - - /* move the colors into words. */ - "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */ - "punpcklbw %%mm6, %%mm1\n" /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */ - - /* src - dst */ - "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */ - - /* A * (src-dst) */ - "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */ - "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */ - "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */ - - "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */ - - "movd %%mm0, (%1)\n" /* result in mm0 */ - - : : "r" (srcp), "r" (dstp), "r" (alpha) ); - - } - ++srcp; - ++dstp; - }, width); - /* *INDENT-ON* */ - srcp += srcskip; - dstp += dstskip; - } - - __asm__("emms\n":); -} - -/* End GCC_ASMBLIT*/ - -#elif MSVC_ASMBLIT +#ifdef __MMX__ /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info) @@ -1775,9 +1391,9 @@ __m64 src1, dst1, mm_alpha, mm_zero, dmask; mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ - /* *INDENT-OFF* */ - multmask = ~(0xFFFFI64 << (ashift * 2)); - /* *INDENT-ON* */ + multmask = 0xFFFF; + multmask <<= (ashift * 2); + multmask = ~multmask; dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */ while (height--) { @@ -1826,9 +1442,7 @@ _mm_empty(); } -/* End MSVC_ASMBLIT */ - -#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ +#endif /* __MMX__ */ /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ @@ -1940,299 +1554,8 @@ } } -#if GCC_ASMBLIT -/* fast RGB565->RGB565 blending with surface alpha */ -static void -Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info) -{ - unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */ - if (alpha == 128) { - Blit16to16SurfaceAlpha128(info, 0xf7de); - } else { - int width = info->d_width; - int height = info->d_height; - Uint16 *srcp = (Uint16 *) info->s_pixels; - int srcskip = info->s_skip >> 1; - Uint16 *dstp = (Uint16 *) info->d_pixels; - int dstskip = info->d_skip >> 1; - Uint32 s, d; - Uint8 load[8]; - - alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */ - *(Uint64 *) load = alpha; - alpha >>= 3; /* downscale alpha to 5 bits */ - - movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ - punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ - punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ - /* position alpha to allow for mullo and mulhi on diff channels - to reduce the number of operations */ - psllq_i2r(3, mm0); - - /* Setup the 565 color channel masks */ - *(Uint64 *) load = 0x07E007E007E007E0ULL; - movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */ - *(Uint64 *) load = 0x001F001F001F001FULL; - movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */ - while (height--) { - /* *INDENT-OFF* */ - DUFFS_LOOP_QUATRO2( - { - s = *srcp++; - d = *dstp; - /* - * shift out the middle component (green) to - * the high 16 bits, and process all three RGB - * components at the same time. - */ - s = (s | s << 16) & 0x07e0f81f; - d = (d | d << 16) & 0x07e0f81f; - d += (s - d) * alpha >> 5; - d &= 0x07e0f81f; - *dstp++ = d | d >> 16; - },{ - s = *srcp++; - d = *dstp; - /* - * shift out the middle component (green) to - * the high 16 bits, and process all three RGB - * components at the same time. - */ - s = (s | s << 16) & 0x07e0f81f; - d = (d | d << 16) & 0x07e0f81f; - d += (s - d) * alpha >> 5; - d &= 0x07e0f81f; - *dstp++ = d | d >> 16; - s = *srcp++; - d = *dstp; - /* - * shift out the middle component (green) to - * the high 16 bits, and process all three RGB - * components at the same time. - */ - s = (s | s << 16) & 0x07e0f81f; - d = (d | d << 16) & 0x07e0f81f; - d += (s - d) * alpha >> 5; - d &= 0x07e0f81f; - *dstp++ = d | d >> 16; - },{ - movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ - movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ - - /* red -- does not need a mask since the right shift clears - the uninteresting bits */ - movq_r2r(mm2, mm5); /* src -> mm5 */ - movq_r2r(mm3, mm6); /* dst -> mm6 */ - psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */ - psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */ - - /* blend */ - psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ - pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ - /* alpha used is actually 11 bits - 11 + 5 = 16 bits, so the sign bits are lost */ - psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ - paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ - psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */ - - movq_r2r(mm6, mm1); /* save new reds in dsts */ - - /* green -- process the bits in place */ - movq_r2r(mm2, mm5); /* src -> mm5 */ - movq_r2r(mm3, mm6); /* dst -> mm6 */ - pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */ - pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */ - - /* blend */ - psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ - pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ - /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting - bits are gone and the sign bits present */ - psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ - paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ - - por_r2r(mm6, mm1); /* save new greens in dsts */ - - /* blue */ - movq_r2r(mm2, mm5); /* src -> mm5 */ - movq_r2r(mm3, mm6); /* dst -> mm6 */ - pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */ - pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ - - /* blend */ - psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ - pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ - /* 11 + 5 = 16 bits, so the sign bits are lost and - the interesting bits will need to be MASKed */ - psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ - paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ - pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */ - - por_r2r(mm6, mm1); /* save new blues in dsts */ - - movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */ - - srcp += 4; - dstp += 4; - }, width); - /* *INDENT-ON* */ - srcp += srcskip; - dstp += dstskip; - } - emms(); - } -} +#ifdef __MMX__ -/* fast RGB555->RGB555 blending with surface alpha */ -static void -Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info) -{ - unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */ - if (alpha == 128) { - Blit16to16SurfaceAlpha128(info, 0xfbde); - } else { - int width = info->d_width; - int height = info->d_height; - Uint16 *srcp = (Uint16 *) info->s_pixels; - int srcskip = info->s_skip >> 1; - Uint16 *dstp = (Uint16 *) info->d_pixels; - int dstskip = info->d_skip >> 1; - Uint32 s, d; - Uint8 load[8]; - - alpha &= ~(1 + 2 + 4); /* cut alpha to get the exact same behaviour */ - *(Uint64 *) load = alpha; - alpha >>= 3; /* downscale alpha to 5 bits */ - - movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ - punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ - punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ - /* position alpha to allow for mullo and mulhi on diff channels - to reduce the number of operations */ - psllq_i2r(3, mm0); - - /* Setup the 555 color channel masks */ - *(Uint64 *) load = 0x03E003E003E003E0ULL; - movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */ - *(Uint64 *) load = 0x001F001F001F001FULL; - movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */ - while (height--) { - /* *INDENT-OFF* */ - DUFFS_LOOP_QUATRO2( - { - s = *srcp++; - d = *dstp; - /* - * shift out the middle component (green) to - * the high 16 bits, and process all three RGB - * components at the same time. - */ - s = (s | s << 16) & 0x03e07c1f; - d = (d | d << 16) & 0x03e07c1f; - d += (s - d) * alpha >> 5; - d &= 0x03e07c1f; - *dstp++ = d | d >> 16; - },{ - s = *srcp++; - d = *dstp; - /* - * shift out the middle component (green) to - * the high 16 bits, and process all three RGB - * components at the same time. - */ - s = (s | s << 16) & 0x03e07c1f; - d = (d | d << 16) & 0x03e07c1f; - d += (s - d) * alpha >> 5; - d &= 0x03e07c1f; - *dstp++ = d | d >> 16; - s = *srcp++; - d = *dstp; - /* - * shift out the middle component (green) to - * the high 16 bits, and process all three RGB - * components at the same time. - */ - s = (s | s << 16) & 0x03e07c1f; - d = (d | d << 16) & 0x03e07c1f; - d += (s - d) * alpha >> 5; - d &= 0x03e07c1f; - *dstp++ = d | d >> 16; - },{ - movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ - movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ - - /* red -- process the bits in place */ - psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */ - /* by reusing the GREEN mask we free up another mmx - register to accumulate the result */ - - movq_r2r(mm2, mm5); /* src -> mm5 */ - movq_r2r(mm3, mm6); /* dst -> mm6 */ - pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */ - pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */ - - /* blend */ - psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ - pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ - /* 11 + 15 - 16 = 10 bits, uninteresting bits will be - cleared by a MASK below */ - psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ - paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ - pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */ - - psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */ - - movq_r2r(mm6, mm1); /* save new reds in dsts */ - - /* green -- process the bits in place */ - movq_r2r(mm2, mm5); /* src -> mm5 */ - movq_r2r(mm3, mm6); /* dst -> mm6 */ - pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */ - pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */ - - /* blend */ - psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ - pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ - /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting - bits are gone and the sign bits present */ - psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ - paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ - - por_r2r(mm6, mm1); /* save new greens in dsts */ - - /* blue */ - movq_r2r(mm2, mm5); /* src -> mm5 */ - movq_r2r(mm3, mm6); /* dst -> mm6 */ - pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */ - pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ - - /* blend */ - psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ - pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ - /* 11 + 5 = 16 bits, so the sign bits are lost and - the interesting bits will need to be MASKed */ - psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ - paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ - pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */ - - por_r2r(mm6, mm1); /* save new blues in dsts */ - - movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */ - - srcp += 4; - dstp += 4; - }, width); - /* *INDENT-ON* */ - srcp += srcskip; - dstp += dstskip; - } - emms(); - } -} - -/* End GCC_ASMBLIT */ - -#elif MSVC_ASMBLIT /* fast RGB565->RGB565 blending with surface alpha */ static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info) @@ -2507,7 +1830,8 @@ _mm_empty(); } } -#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ + +#endif /* __MMX__ */ /* fast RGB565->RGB565 blending with surface alpha */ static void @@ -2852,14 +2176,14 @@ case 2: if (surface->map->identity) { if (df->Gmask == 0x7e0) { -#if MMX_ASMBLIT +#ifdef __MMX__ if (SDL_HasMMX()) return Blit565to565SurfaceAlphaMMX; else #endif return Blit565to565SurfaceAlpha; } else if (df->Gmask == 0x3e0) { -#if MMX_ASMBLIT +#ifdef __MMX__ if (SDL_HasMMX()) return Blit555to555SurfaceAlphaMMX; else @@ -2873,7 +2197,7 @@ if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) { -#if MMX_ASMBLIT +#ifdef __MMX__ if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0 && SDL_HasMMX()) @@ -2928,7 +2252,7 @@ if (sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) { -#if MMX_ASMBLIT +#ifdef __MMX__ if (sf->Rshift % 8 == 0 && sf->Gshift % 8 == 0 && sf->Bshift % 8 == 0