Mercurial > sdl-ios-xcode
diff src/video/SDL_blit_A.c @ 1542:a8bf1aa21020
Fixed bug #15
SDL_blit_A.mmx-speed.patch.txt --
Speed improvements and a bugfix for the current GCC inline mmx
asm code:
- Changed some ops and removed some resulting useless ones.
- Added some instruction parallelism (some gain)
The resulting speed on my Xeon improved up to 35% depending on
the function (measured in fps).
- Fixed a bug where BlitRGBtoRGBSurfaceAlphaMMX() was
setting the alpha component on the destination surfaces (to
opaque-alpha) even when the surface had none.
SDL_blit_A.mmx-msvc.patch.txt --
MSVC mmx intrinsics version of the same GCC asm code.
The MSVC compiler tries to parallelize the code and to avoid
register stalls, but does not always do a very good job.
Per-surface blending MSVC functions run quite a bit faster
than their pure-asm counterparts (up to 55% faster for 16bit
ones), but the per-pixel blending runs somewhat slower than asm.
- BlitRGBtoRGBSurfaceAlphaMMX and BlitRGBtoRGBPixelAlphaMMX (and all
variants) can now also handle formats other than (A)RGB8888. Formats
like RGBA8888 and some quite exotic ones are allowed -- like
RAGB8888, or actually anything having channels aligned on 8bit
boundary and full 8bit alpha (for per-pixel alpha blending).
The performance cost of this change is virtually 0 for per-surface
alpha blending (no extra ops inside the loop) and a single non-MMX
op inside the loop for per-pixel blending. In testing, the per-pixel
alpha blending takes a ~2% performance hit, but it still runs much
faster than the current code in CVS. If necessary, a separate function
with this functionality can be made.
This code requires Processor Pack for VC6.
author | Sam Lantinga <slouken@libsdl.org> |
---|---|
date | Wed, 15 Mar 2006 15:39:29 +0000 |
parents | dc6b59e925a2 |
children | 4b835e36633d |
line wrap: on
line diff
--- a/src/video/SDL_blit_A.c Wed Mar 15 05:52:31 2006 +0000 +++ b/src/video/SDL_blit_A.c Wed Mar 15 15:39:29 2006 +0000 @@ -24,14 +24,23 @@ #include "SDL_video.h" #include "SDL_blit.h" -#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES +#if SDL_ASSEMBLY_ROUTINES +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) #define MMX_ASMBLIT 1 +#define GCC_ASMBLIT 1 +#elif defined(_MSC_VER) && (_MSC_VER >= 1200) && defined(_M_IX86) +#define MMX_ASMBLIT 1 +#define MSVC_ASMBLIT 1 #endif +#endif /* SDL_ASSEMBLY_ROUTINES */ /* Function to check the CPU flags */ #include "SDL_cpuinfo.h" -#if MMX_ASMBLIT +#if GCC_ASMBLIT #include "mmx.h" +#elif MSVC_ASMBLIT +#include <mmintrin.h> +#include <mm3dnow.h> #endif /* Functions to perform alpha blended blitting */ @@ -198,7 +207,7 @@ } } -#if MMX_ASMBLIT +#if GCC_ASMBLIT /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) { @@ -208,43 +217,44 @@ int srcskip = info->s_skip >> 2; Uint32 *dstp = (Uint32 *)info->d_pixels; int dstskip = info->d_skip >> 2; - Uint8 load[8]; - - *(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */ - movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */ - *(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */ - movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */ - *(Uint64 *)load = 0xFF000000FF000000ULL;/* dst alpha mask */ - movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */ + Uint32 dalpha = info->dst->Amask; + Uint8 load[8]; + + *(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */ + movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */ + *(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */ + movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */ + movd_m2r(dalpha, mm7); /* dst alpha mask */ + punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */ while(height--) { - DUFFS_LOOP_DOUBLE2( - { - Uint32 s = *srcp++; - Uint32 d = *dstp; - *dstp++ = 
((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) - + (s & d & 0x00010101)) | 0xff000000; - },{ - movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ - movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ - - movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */ - movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */ - - pand_r2r(mm4, mm6); /* dst & mask -> mm6 */ - pand_r2r(mm4, mm5); /* src & mask -> mm5 */ - paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */ - psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */ - - pand_r2r(mm1, mm2); /* src & dst -> mm2 */ - pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */ - paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */ - por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ - movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */ - dstp += 2; - srcp += 2; - }, width); - srcp += srcskip; - dstp += dstskip; + DUFFS_LOOP_DOUBLE2( + { + Uint32 s = *srcp++; + Uint32 d = *dstp; + *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) + + (s & d & 0x00010101)) | dalpha; + },{ + movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ + movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ + + movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */ + movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */ + + pand_r2r(mm4, mm6); /* dst & mask -> mm6 */ + pand_r2r(mm4, mm5); /* src & mask -> mm5 */ + paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */ + pand_r2r(mm1, mm2); /* src & dst -> mm2 */ + psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */ + pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */ + paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */ + + por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ + movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */ + dstp += 2; + srcp += 2; + }, width); + srcp += srcskip; + dstp += dstskip; } emms(); } @@ -252,8 +262,11 @@ /* fast RGB888->(A)RGB888 blending with surface alpha */ static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) { + SDL_PixelFormat* df = info->dst; unsigned alpha = info->src->alpha; - if(alpha == 128) { + + if (alpha == 128 && (df->Rmask | 
df->Gmask | df->Bmask) == 0x00FFFFFF) { + /* only call a128 version when R,G,B occupy lower bits */ BlitRGBtoRGBSurfaceAlpha128MMX(info); } else { int width = info->d_width; @@ -262,75 +275,68 @@ int srcskip = info->s_skip >> 2; Uint32 *dstp = (Uint32 *)info->d_pixels; int dstskip = info->d_skip >> 2; - Uint8 load[8] = {alpha, alpha, alpha, alpha, - alpha, alpha, alpha, alpha}; - - movq_m2r(*load, mm4); /* alpha -> mm4 */ - *(Uint64 *)load = 0x00FF00FF00FF00FFULL; - movq_m2r(*load, mm3); /* mask -> mm3 */ - pand_r2r(mm3, mm4); /* mm4 & mask -> 0A0A0A0A -> mm4 */ - *(Uint64 *)load = 0xFF000000FF000000ULL;/* dst alpha mask */ - movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */ + + pxor_r2r(mm5, mm5); /* 0 -> mm5 */ + /* form the alpha mult */ + movd_m2r(alpha, mm4); /* 0000000A -> mm4 */ + punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */ + punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */ + alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift); + movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */ + punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */ + pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */ + /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */ + movd_m2r(df->Amask, mm7); /* dst alpha mask */ + punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */ while(height--) { DUFFS_LOOP_DOUBLE2({ /* One Pixel Blend */ - movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ - punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */ - pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */ - - movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ - movq_r2r(mm2, mm6);/* dst(ARGB) -> mm6 (0000ARGB)*/ - punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */ - pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ - - psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ - pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ - psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ - paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ - pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ - packuswb_r2r(mm2, mm2); /* 
ARGBARGB -> mm2 */ - por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ - movd_r2m(mm2, *dstp);/* mm2 -> Pixel */ + movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ + movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ + punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */ + punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */ + + psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ + pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ + psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ + paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ + + packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */ + por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ + movd_r2m(mm2, *dstp);/* mm2 -> pixel */ ++srcp; ++dstp; },{ - /* Two Pixels Blend */ + /* Two Pixels Blend */ movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/ - movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */ - punpcklbw_r2r(mm0, mm0); /* low - AARRGGBB -> mm0 */ - pand_r2r(mm3, mm0); /* 0A0R0G0B -> mm0(src1) */ - punpckhbw_r2r(mm1, mm1); /* high - AARRGGBB -> mm1 */ - pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1(src2) */ - - movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ - movq_r2r(mm2, mm5); /* 2 x dst -> mm5(ARGBARGB) */ - movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ - punpcklbw_r2r(mm2, mm2); /* low - AARRGGBB -> mm2 */ - punpckhbw_r2r(mm6, mm6); /* high - AARRGGBB -> mm6 */ - pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2(dst1) */ - - psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */ - pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */ - pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6(dst2) */ - psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */ - psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */ - pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ - paddw_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */ - psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm0 */ - pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ - paddw_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */ - pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6 */ - packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */ - packuswb_r2r(mm6, mm6); /* ARGBARGB 
-> mm6 */ - psrlq_i2r(32, mm2); /* mm2 >> 32 -> mm2 */ - psllq_i2r(32, mm6); /* mm6 << 32 -> mm6 */ - por_r2r(mm6, mm2); /* mm6 | mm2 -> mm2 */ - por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ - movq_r2m(mm2, *dstp);/* mm2 -> 2 x Pixel */ - srcp += 2; - dstp += 2; - }, width); + movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ + movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */ + movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ + + punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */ + punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */ + punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */ + punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */ + + psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */ + pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */ + psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */ + paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */ + + psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */ + pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ + psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ + paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */ + + packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */ + por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */ + + movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */ + + srcp += 2; + dstp += 2; + }, width); srcp += srcskip; dstp += dstskip; } @@ -347,62 +353,65 @@ int srcskip = info->s_skip >> 2; Uint32 *dstp = (Uint32 *)info->d_pixels; int dstskip = info->d_skip >> 2; - Uint32 alpha = 0; - Uint8 load[8]; - - *(Uint64 *)load = 0x00FF00FF00FF00FFULL; - movq_m2r(*load, mm3); /* mask -> mm2 */ - *(Uint64 *)load = 0x00FF000000000000ULL; - movq_m2r(*load, mm7); /* dst alpha mask -> mm2 */ - *(Uint64 *)load = 0x00FFFFFF00FFFFFFULL; - movq_m2r(*load, mm0); /* alpha 255 mask -> mm0 */ - *(Uint64 *)load = 0xFF000000FF000000ULL; - movq_m2r(*load, mm6); /* alpha 255 !mask -> mm6 */ + SDL_PixelFormat* sf = info->src; + Uint32 amask = sf->Amask; + + pxor_r2r(mm6, mm6); /* 0 -> mm6 */ + /* form multiplication mask */ + 
movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */ + punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */ + pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */ + movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */ + pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */ + /* form channel masks */ + movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */ + packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */ + packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */ + pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */ + /* get alpha channel shift */ + movd_m2r(sf->Ashift, mm5); /* Ashift -> mm5 */ + while(height--) { DUFFS_LOOP4({ - alpha = *srcp; - alpha >>= 24; + Uint32 alpha = *srcp & amask; /* FIXME: Here we special-case opaque alpha since the - compositioning used (>>8 instead of /255) doesn't handle - it correctly. Also special-case alpha=0 for speed? - Benchmark this! */ - if(alpha) { - if(alpha == SDL_ALPHA_OPAQUE) { - movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ - movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ - pand_r2r(mm0, mm1); - pand_r2r(mm6, mm2); - por_r2r(mm1, mm2); - movd_r2m(mm2, (*dstp)); - } else { - movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ - punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */ - pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */ - - movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ - punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */ - pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ - - movq_r2r(mm2, mm5);/* mm2(0A0R0G0B) -> mm5 */ - pand_r2r(mm7, mm5); /* mm5 & dst alpha mask -> mm5(0A000000) */ - psrlq_i2r(24, mm5); /* mm5 >> 24 -> mm5 (0000A000)*/ - - movq_r2r(mm1, mm4);/* mm1(0A0R0G0B) -> mm4 */ - psrlq_i2r(48, mm4); /* mm4 >> 48 -> mm4(0000000A) */ - punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */ - punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */ - - /* blend */ - psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ - pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ - psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ - paddw_r2r(mm1, mm2); /* mm1 + 
mm2(dst) -> mm2 */ - pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ - packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */ - pand_r2r(mm0, mm2); /* 0RGB0RGB -> mm2 */ - por_r2r(mm5, mm2); /* dst alpha | mm2 -> mm2 */ - movd_r2m(mm2, *dstp);/* mm2 -> dst */ - } + compositioning used (>>8 instead of /255) doesn't handle + it correctly. Also special-case alpha=0 for speed? + Benchmark this! */ + if(alpha == 0) { + /* do nothing */ + } else if(alpha == amask) { + /* opaque alpha -- copy RGB, keep dst alpha */ + /* using MMX here to free up regular registers for other things */ + movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ + movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ + pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */ + pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */ + por_r2r(mm1, mm2); /* src | dst -> mm2 */ + movd_r2m(mm2, (*dstp)); /* mm2 -> dst */ + } else { + movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ + punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */ + + movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ + punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */ + + __asm__ __volatile__ ( + "movd %0, %%mm4" + : : "r" (alpha) ); /* 0000A000 -> mm4 */ + psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */ + punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */ + punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */ + pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */ + + /* blend */ + psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ + pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ + psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */ + paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ + + packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */ + movd_r2m(mm2, *dstp);/* mm2 -> dst */ } ++srcp; ++dstp; @@ -412,7 +421,220 @@ } emms(); } -#endif +/* End GCC_ASMBLIT */ + +#elif MSVC_ASMBLIT +/* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ +static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) +{ + int width = info->d_width; + 
int height = info->d_height; + Uint32 *srcp = (Uint32 *)info->s_pixels; + int srcskip = info->s_skip >> 2; + Uint32 *dstp = (Uint32 *)info->d_pixels; + int dstskip = info->d_skip >> 2; + Uint32 dalpha = info->dst->Amask; + + __m64 src1, src2, dst1, dst2, lmask, hmask, dsta; + + hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */ + lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */ + dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */ + + while (height--) { + int n = width; + if ( n & 1 ) { + Uint32 s = *srcp++; + Uint32 d = *dstp; + *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) + + (s & d & 0x00010101)) | dalpha; + n--; + } + + for (n >>= 1; n > 0; --n) { + dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */ + dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */ + + src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */ + src2 = src1; /* 2 x src -> src2(ARGBARGB) */ + + dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */ + src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */ + src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */ + src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */ + + dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */ + dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */ + dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */ + dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */ + + *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */ + dstp += 2; + srcp += 2; + } + + srcp += srcskip; + dstp += dstskip; + } + _mm_empty(); +} + +/* fast RGB888->(A)RGB888 blending with surface alpha */ +static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) +{ + SDL_PixelFormat* df = info->dst; + Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask; + unsigned alpha = info->src->alpha; + + if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) { + /* only call a128 version when R,G,B occupy lower bits */ + 
BlitRGBtoRGBSurfaceAlpha128MMX(info); + } else { + int width = info->d_width; + int height = info->d_height; + Uint32 *srcp = (Uint32 *)info->s_pixels; + int srcskip = info->s_skip >> 2; + Uint32 *dstp = (Uint32 *)info->d_pixels; + int dstskip = info->d_skip >> 2; + Uint32 dalpha = df->Amask; + Uint32 amult; + + __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta; + + mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ + /* form the alpha mult */ + amult = alpha | (alpha << 8); + amult = amult | (amult << 16); + chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift); + mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */ + mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */ + /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */ + dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */ + + while (height--) { + int n = width; + if (n & 1) { + /* One Pixel Blend */ + src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/ + src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */ + + dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/ + dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ + + src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */ + src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ + src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */ + dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */ + + dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ + dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */ + *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ + + ++srcp; + ++dstp; + + n--; + } + + for (n >>= 1; n > 0; --n) { + /* Two Pixels Blend */ + src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/ + src2 = src1; /* 2 x src -> src2(ARGBARGB) */ + src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */ + src2 = _mm_unpackhi_pi8(src2, 
mm_zero); /* high - 0A0R0G0B -> src2 */ + + dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */ + dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */ + dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */ + dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */ + + src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */ + src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */ + src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */ + dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */ + + src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */ + src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ + src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */ + dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */ + + dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */ + dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */ + + *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */ + + srcp += 2; + dstp += 2; + } + srcp += srcskip; + dstp += dstskip; + } + _mm_empty(); + } +} + +/* fast ARGB888->(A)RGB888 blending with pixel alpha */ +static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info) +{ + int width = info->d_width; + int height = info->d_height; + Uint32 *srcp = (Uint32 *)info->s_pixels; + int srcskip = info->s_skip >> 2; + Uint32 *dstp = (Uint32 *)info->d_pixels; + int dstskip = info->d_skip >> 2; + SDL_PixelFormat* sf = info->src; + Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; + Uint32 amask = sf->Amask; + Uint32 ashift = sf->Ashift; + Uint64 multmask; + + __m64 src1, dst1, mm_alpha, mm_zero, dmask; + + mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ + multmask = ~(0xFFFFi64 << (ashift * 2)); + dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */ + + while(height--) { + DUFFS_LOOP4({ + Uint32 alpha = *srcp & amask; + if (alpha == 0) { + /* do nothing */ + } else if (alpha == amask) { + /* opaque alpha -- copy RGB, keep dst alpha */ + *dstp 
= (*srcp & chanmask) | (*dstp & ~chanmask); + } else { + src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/ + src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */ + + dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/ + dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ + + mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */ + mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */ + mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ + mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ + mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */ + + /* blend */ + src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */ + src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */ + src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */ + dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */ + dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ + + *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ + } + ++srcp; + ++dstp; + }, width); + srcp += srcskip; + dstp += dstskip; + } + _mm_empty(); +} +/* End MSVC_ASMBLIT */ + +#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ #if SDL_ALTIVEC_BLITTERS #if HAVE_ALTIVEC_H @@ -1326,7 +1548,7 @@ } } -#if MMX_ASMBLIT +#if GCC_ASMBLIT /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ inline static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info) { @@ -1336,38 +1558,61 @@ int srcskip = info->s_skip >> 2; Uint32 *dstp = (Uint32 *)info->d_pixels; int dstskip = info->d_skip >> 2; - - Uint32 s; - Uint32 alpha; + SDL_PixelFormat* sf = info->src; + Uint32 amask = sf->Amask; __asm__ ( /* make mm6 all zeros. */ "pxor %%mm6, %%mm6\n" /* Make a mask to preserve the alpha. 
*/ - "pcmpeqb %%mm7, %%mm7\n\t" /* mm7(s) = FF FF FF FF | FF FF FF FF */ - "psrlq $16, %%mm7\n\t" /* mm7(s) = 00 00 FF FF | FF FF FF FF */ + "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */ + "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */ + "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */ + "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */ + "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */ - : ); + /* form channel masks */ + "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */ + "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */ + "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */ + "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */ + + /* get alpha channel shift */ + "movd %1, %%mm5\n\t" /* Ashift -> mm5 */ + + : /* nothing */ : "m" (sf->Amask), "m" (sf->Ashift) ); while(height--) { DUFFS_LOOP4({ + Uint32 alpha; __asm__ ( "prefetch 64(%0)\n" "prefetch 64(%1)\n" : : "r" (srcp), "r" (dstp) ); - s = *srcp; - alpha = s >> 24; + alpha = *srcp & amask; /* FIXME: Here we special-case opaque alpha since the compositioning used (>>8 instead of /255) doesn't handle it correctly. Also special-case alpha=0 for speed? Benchmark this! 
*/ - - if(alpha == SDL_ALPHA_OPAQUE) { - *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); + if(alpha == 0) { + /* do nothing */ + } + else if(alpha == amask) { + /* opaque alpha -- copy RGB, keep dst alpha */ + /* using MMX here to free up regular registers for other things */ + __asm__ ( + "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/ + "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/ + "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */ + "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */ + "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */ + "movd %%mm1, (%1) \n\t" /* mm1 -> dst */ + + : : "r" (srcp), "r" (dstp) ); } else { @@ -1383,10 +1628,11 @@ /*"psrlw $8, %%mm2\n" */ /* else: */ - "movq %%mm0, %%mm2\n" - "psrld $24, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */ + "movd %2, %%mm2\n" + "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */ "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */ "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */ + "pand %%mm7, %%mm2\n" /* to preserve dest alpha */ /* move the colors into words. 
*/ "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */ @@ -1396,16 +1642,15 @@ "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */ /* A * (src-dst) */ - "pmullw %%mm2, %%mm0\n" /* mm0 = As*As-d As*Rs-d | As*Gs-d As*Bs-d */ - "pand %%mm7, %%mm0\n" /* to preserve dest alpha */ - "psrlw $8, %%mm0\n" /* mm0 = Ac>>8 Rc>>8 | Gc>>8 Bc>>8 */ - "paddb %%mm1, %%mm0\n" /* mm0 = Ac+Ad Rc+Rd | Gc+Gd Bc+Bd */ + "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */ + "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */ + "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */ "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */ "movd %%mm0, (%1)\n" /* result in mm0 */ - : : "r" (srcp), "r" (dstp) ); + : : "r" (srcp), "r" (dstp), "r" (alpha) ); } ++srcp; @@ -1419,7 +1664,76 @@ "emms\n" : ); } -#endif +/* End GCC_ASMBLIT*/ + +#elif MSVC_ASMBLIT +/* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ +static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info) +{ + int width = info->d_width; + int height = info->d_height; + Uint32 *srcp = (Uint32 *)info->s_pixels; + int srcskip = info->s_skip >> 2; + Uint32 *dstp = (Uint32 *)info->d_pixels; + int dstskip = info->d_skip >> 2; + SDL_PixelFormat* sf = info->src; + Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; + Uint32 amask = sf->Amask; + Uint32 ashift = sf->Ashift; + Uint64 multmask; + + __m64 src1, dst1, mm_alpha, mm_zero, dmask; + + mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ + multmask = ~(0xFFFFi64 << (ashift * 2)); + dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */ + + while(height--) { + DUFFS_LOOP4({ + Uint32 alpha; + + _m_prefetch(srcp + 16); + _m_prefetch(dstp + 16); + + alpha = *srcp & amask; + if (alpha == 0) { + /* do nothing */ + } else if (alpha == amask) { + /* copy RGB, keep dst alpha */ + *dstp = (*srcp & chanmask) | (*dstp & ~chanmask); + } else { + src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/ + src1 = 
_mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */ + + dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/ + dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ + + mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */ + mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */ + mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ + mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ + mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */ + + /* blend */ + src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */ + src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */ + src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */ + dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */ + dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ + + *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ + } + ++srcp; + ++dstp; + }, width); + srcp += srcskip; + dstp += dstskip; + } + _mm_empty(); +} +/* End MSVC_ASMBLIT */ + +#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ @@ -1530,7 +1844,7 @@ } } -#if MMX_ASMBLIT +#if GCC_ASMBLIT /* fast RGB565->RGB565 blending with surface alpha */ static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) { @@ -1544,28 +1858,29 @@ int srcskip = info->s_skip >> 1; Uint16 *dstp = (Uint16 *)info->d_pixels; int dstskip = info->d_skip >> 1; - Uint32 s, d; - Uint8 load[8]; + Uint32 s, d; + Uint8 load[8]; alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ - *(Uint64 *)load = alpha; + *(Uint64 *)load = alpha; alpha >>= 3; /* downscale alpha to 5 bits */ - movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ - punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ - punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ + movq_m2r(*load, mm0); /* alpha(0000000A) 
-> mm0 */ + punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ + punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ + /* position alpha to allow for mullo and mulhi on diff channels + to reduce the number of operations */ + psllq_i2r(3, mm0); - /* Setup the 565 color channel masks */ - *(Uint64 *)load = 0xF800F800F800F800ULL; - movq_m2r(*load, mm1); /* MASKRED -> mm1 */ + /* Setup the 565 color channel masks */ *(Uint64 *)load = 0x07E007E007E007E0ULL; movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */ *(Uint64 *)load = 0x001F001F001F001FULL; movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */ while(height--) { - DUFFS_LOOP_QUATRO2( - { - s = *srcp++; + DUFFS_LOOP_QUATRO2( + { + s = *srcp++; d = *dstp; /* * shift out the middle component (green) to @@ -1577,8 +1892,8 @@ d += (s - d) * alpha >> 5; d &= 0x07e0f81f; *dstp++ = d | d >> 16; - },{ - s = *srcp++; + },{ + s = *srcp++; d = *dstp; /* * shift out the middle component (green) to @@ -1590,7 +1905,7 @@ d += (s - d) * alpha >> 5; d &= 0x07e0f81f; *dstp++ = d | d >> 16; - s = *srcp++; + s = *srcp++; d = *dstp; /* * shift out the middle component (green) to @@ -1602,78 +1917,66 @@ d += (s - d) * alpha >> 5; d &= 0x07e0f81f; *dstp++ = d | d >> 16; - },{ - movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ - movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ - - /* RED */ - movq_r2r(mm2, mm5); /* src -> mm5 */ - pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */ - psrlq_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */ - - movq_r2r(mm3, mm6); /* dst -> mm6 */ - pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */ - psrlq_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */ - - /* blend */ - psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ - pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ - psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ - paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ - psllq_i2r(11, mm6); /* mm6 << 11 -> mm6 */ - pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */ - - movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */ - por_r2r(mm7, mm5); /* 
MASKBLUE | mm5 -> mm5 */ - pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */ - por_r2r(mm6, mm3); /* save new reds in dsts */ - - /* green */ - movq_r2r(mm2, mm5); /* src -> mm5 */ - pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */ - psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */ - - movq_r2r(mm3, mm6); /* dst -> mm6 */ - pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */ - psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */ - - /* blend */ - psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ - pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ - psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ - paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ - psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */ - pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */ - - movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ - por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ - pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */ - por_r2r(mm6, mm3); /* save new greens in dsts */ - - /* blue */ - movq_r2r(mm2, mm5); /* src -> mm5 */ - pand_r2r(mm7 , mm5); /* src & MASKRED -> mm5[000b 000b 000b 000b] */ - - movq_r2r(mm3, mm6); /* dst -> mm6 */ - pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ - - /* blend */ - psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ - pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ - psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ - paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ - pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */ - - movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ - por_r2r(mm4, mm5); /* MASKGREEN | mm5 -> mm5 */ - pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */ - por_r2r(mm6, mm3); /* save new blues in dsts */ - - movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */ - - srcp += 4; - dstp += 4; - }, width); + },{ + movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ + movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ + + /* red -- does not need a mask since the right shift clears + the uninteresting bits */ + movq_r2r(mm2, mm5); /* src -> mm5 */ + movq_r2r(mm3, 
mm6); /* dst -> mm6 */ + psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */ + psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */ + + /* blend */ + psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ + pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ + /* alpha used is actually 11 bits + 11 + 5 = 16 bits, so the sign bits are lost */ + psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ + paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ + psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */ + + movq_r2r(mm6, mm1); /* save new reds in dsts */ + + /* green -- process the bits in place */ + movq_r2r(mm2, mm5); /* src -> mm5 */ + movq_r2r(mm3, mm6); /* dst -> mm6 */ + pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */ + pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */ + + /* blend */ + psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ + pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ + /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting + bits are gone and the sign bits present */ + psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ + paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ + + por_r2r(mm6, mm1); /* save new greens in dsts */ + + /* blue */ + movq_r2r(mm2, mm5); /* src -> mm5 */ + movq_r2r(mm3, mm6); /* dst -> mm6 */ + pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */ + pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ + + /* blend */ + psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ + pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ + /* 11 + 5 = 16 bits, so the sign bits are lost and + the interesting bits will need to be MASKed */ + psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ + paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ + pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */ + + por_r2r(mm6, mm1); /* save new blues in dsts */ + + movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */ + + srcp += 4; + dstp += 4; + }, width); srcp += srcskip; dstp += dstskip; } @@ -1694,28 +1997,29 @@ int srcskip = info->s_skip >> 1; Uint16 *dstp = 
(Uint16 *)info->d_pixels; int dstskip = info->d_skip >> 1; - Uint32 s, d; - Uint8 load[8]; + Uint32 s, d; + Uint8 load[8]; alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ - *(Uint64 *)load = alpha; + *(Uint64 *)load = alpha; alpha >>= 3; /* downscale alpha to 5 bits */ - movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ - punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ - punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ - - /* Setup the 555 color channel masks */ - *(Uint64 *)load = 0x7C007C007C007C00ULL; - movq_m2r(*load, mm1); /* MASKRED -> mm1 */ + movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ + punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ + punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ + /* position alpha to allow for mullo and mulhi on diff channels + to reduce the number of operations */ + psllq_i2r(3, mm0); + + /* Setup the 555 color channel masks */ *(Uint64 *)load = 0x03E003E003E003E0ULL; movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */ *(Uint64 *)load = 0x001F001F001F001FULL; movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */ while(height--) { - DUFFS_LOOP_QUATRO2( - { - s = *srcp++; + DUFFS_LOOP_QUATRO2( + { + s = *srcp++; d = *dstp; /* * shift out the middle component (green) to @@ -1727,8 +2031,8 @@ d += (s - d) * alpha >> 5; d &= 0x03e07c1f; *dstp++ = d | d >> 16; - },{ - s = *srcp++; + },{ + s = *srcp++; d = *dstp; /* * shift out the middle component (green) to @@ -1752,85 +2056,349 @@ d += (s - d) * alpha >> 5; d &= 0x03e07c1f; *dstp++ = d | d >> 16; - },{ - movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ - movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ - - /* RED */ - movq_r2r(mm2, mm5); /* src -> mm5 */ - pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */ - psrlq_i2r(10, mm5); /* mm5 >> 10 -> mm5 [000r 000r 000r 000r] */ - - movq_r2r(mm3, mm6); /* dst -> mm6 */ - pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */ - psrlq_i2r(10, mm6); /* mm6 >> 10 -> mm6 [000r 000r 000r 000r] */ - - /* blend */ - psubw_r2r(mm6, mm5);/* src - dst -> mm5 
*/ - pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ - psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ - paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ - psllq_i2r(10, mm6); /* mm6 << 10 -> mm6 */ - pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */ - - movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */ - por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ - pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */ - por_r2r(mm6, mm3); /* save new reds in dsts */ - - /* green */ - movq_r2r(mm2, mm5); /* src -> mm5 */ - pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */ - psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */ - - movq_r2r(mm3, mm6); /* dst -> mm6 */ - pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */ - psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */ - - /* blend */ - psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ - pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ - psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ - paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ - psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */ - pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */ - - movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ - por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ - pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */ - por_r2r(mm6, mm3); /* save new greens in dsts */ - - /* blue */ - movq_r2r(mm2, mm5); /* src -> mm5 */ - pand_r2r(mm7 , mm5); /* src & MASKRED -> mm5[000b 000b 000b 000b] */ - - movq_r2r(mm3, mm6); /* dst -> mm6 */ - pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ - - /* blend */ - psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ - pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ - psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ - paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ - pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */ - - movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ - por_r2r(mm4, mm5); /* MASKGREEN | mm5 -> mm5 */ - pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */ - por_r2r(mm6, mm3); /* save new blues in dsts */ - - movq_r2m(mm3, *dstp);/* mm3 -> 4 
dst pixels */ - - srcp += 4; - dstp += 4; - }, width); + },{ + movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ + movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ + + /* red -- process the bits in place */ + psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */ + /* by reusing the GREEN mask we free up another mmx + register to accumulate the result */ + + movq_r2r(mm2, mm5); /* src -> mm5 */ + movq_r2r(mm3, mm6); /* dst -> mm6 */ + pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */ + pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */ + + /* blend */ + psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ + pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ + /* 11 + 15 - 16 = 10 bits, uninteresting bits will be + cleared by a MASK below */ + psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ + paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ + pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */ + + psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */ + + movq_r2r(mm6, mm1); /* save new reds in dsts */ + + /* green -- process the bits in place */ + movq_r2r(mm2, mm5); /* src -> mm5 */ + movq_r2r(mm3, mm6); /* dst -> mm6 */ + pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */ + pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */ + + /* blend */ + psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ + pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ + /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting + bits are gone and the sign bits present */ + psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ + paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ + + por_r2r(mm6, mm1); /* save new greens in dsts */ + + /* blue */ + movq_r2r(mm2, mm5); /* src -> mm5 */ + movq_r2r(mm3, mm6); /* dst -> mm6 */ + pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */ + pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ + + /* blend */ + psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ + pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ + /* 11 + 5 = 16 bits, so the sign bits are lost and + the interesting bits will 
need to be MASKed */ + psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ + paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ + pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */ + + por_r2r(mm6, mm1); /* save new blues in dsts */ + + movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */ + + srcp += 4; + dstp += 4; + }, width); srcp += srcskip; dstp += dstskip; } emms(); } } -#endif +/* End GCC_ASMBLIT */ + +#elif MSVC_ASMBLIT +/* fast RGB565->RGB565 blending with surface alpha */ +static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) +{ + unsigned alpha = info->src->alpha; + if(alpha == 128) { + Blit16to16SurfaceAlpha128(info, 0xf7de); + } else { + int width = info->d_width; + int height = info->d_height; + Uint16 *srcp = (Uint16 *)info->s_pixels; + int srcskip = info->s_skip >> 1; + Uint16 *dstp = (Uint16 *)info->d_pixels; + int dstskip = info->d_skip >> 1; + Uint32 s, d; + + __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha; + + alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ + mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */ + alpha >>= 3; /* downscale alpha to 5 bits */ + + mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ + mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ + /* position alpha to allow for mullo and mulhi on diff channels + to reduce the number of operations */ + mm_alpha = _mm_slli_si64(mm_alpha, 3); + + /* Setup the 565 color channel masks */ + gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */ + bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */ + + while(height--) { + DUFFS_LOOP_QUATRO2( + { + s = *srcp++; + d = *dstp; + /* + * shift out the middle component (green) to + * the high 16 bits, and process all three RGB + * components at the same time. 
+ */ + s = (s | s << 16) & 0x07e0f81f; + d = (d | d << 16) & 0x07e0f81f; + d += (s - d) * alpha >> 5; + d &= 0x07e0f81f; + *dstp++ = d | d >> 16; + },{ + s = *srcp++; + d = *dstp; + /* + * shift out the middle component (green) to + * the high 16 bits, and process all three RGB + * components at the same time. + */ + s = (s | s << 16) & 0x07e0f81f; + d = (d | d << 16) & 0x07e0f81f; + d += (s - d) * alpha >> 5; + d &= 0x07e0f81f; + *dstp++ = d | d >> 16; + s = *srcp++; + d = *dstp; + /* + * shift out the middle component (green) to + * the high 16 bits, and process all three RGB + * components at the same time. + */ + s = (s | s << 16) & 0x07e0f81f; + d = (d | d << 16) & 0x07e0f81f; + d += (s - d) * alpha >> 5; + d &= 0x07e0f81f; + *dstp++ = d | d >> 16; + },{ + src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */ + dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */ + + /* red */ + src2 = src1; + src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */ + + dst2 = dst1; + dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */ + + /* blend */ + src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ + src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ + src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ + dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ + dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */ + + mm_res = dst2; /* RED -> mm_res */ + + /* green -- process the bits in place */ + src2 = src1; + src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */ + + dst2 = dst1; + dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */ + + /* blend */ + src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ + src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ + src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ + dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ + + mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */ + + /* blue */ + src2 
= src1; + src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */ + + dst2 = dst1; + dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */ + + /* blend */ + src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ + src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ + src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ + dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ + dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */ + + mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */ + + *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */ + + srcp += 4; + dstp += 4; + }, width); + srcp += srcskip; + dstp += dstskip; + } + _mm_empty(); + } +} + +/* fast RGB555->RGB555 blending with surface alpha */ +static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info) +{ + unsigned alpha = info->src->alpha; + if(alpha == 128) { + Blit16to16SurfaceAlpha128(info, 0xfbde); + } else { + int width = info->d_width; + int height = info->d_height; + Uint16 *srcp = (Uint16 *)info->s_pixels; + int srcskip = info->s_skip >> 1; + Uint16 *dstp = (Uint16 *)info->d_pixels; + int dstskip = info->d_skip >> 1; + Uint32 s, d; + + __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha; + + alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ + mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */ + alpha >>= 3; /* downscale alpha to 5 bits */ + + mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ + mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ + /* position alpha to allow for mullo and mulhi on diff channels + to reduce the number of operations */ + mm_alpha = _mm_slli_si64(mm_alpha, 3); + + /* Setup the 555 color channel masks */ + rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */ + gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */ + bmask = 
_mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */ + + while(height--) { + DUFFS_LOOP_QUATRO2( + { + s = *srcp++; + d = *dstp; + /* + * shift out the middle component (green) to + * the high 16 bits, and process all three RGB + * components at the same time. + */ + s = (s | s << 16) & 0x03e07c1f; + d = (d | d << 16) & 0x03e07c1f; + d += (s - d) * alpha >> 5; + d &= 0x03e07c1f; + *dstp++ = d | d >> 16; + },{ + s = *srcp++; + d = *dstp; + /* + * shift out the middle component (green) to + * the high 16 bits, and process all three RGB + * components at the same time. + */ + s = (s | s << 16) & 0x03e07c1f; + d = (d | d << 16) & 0x03e07c1f; + d += (s - d) * alpha >> 5; + d &= 0x03e07c1f; + *dstp++ = d | d >> 16; + s = *srcp++; + d = *dstp; + /* + * shift out the middle component (green) to + * the high 16 bits, and process all three RGB + * components at the same time. + */ + s = (s | s << 16) & 0x03e07c1f; + d = (d | d << 16) & 0x03e07c1f; + d += (s - d) * alpha >> 5; + d &= 0x03e07c1f; + *dstp++ = d | d >> 16; + },{ + src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */ + dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */ + + /* red -- process the bits in place */ + src2 = src1; + src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */ + + dst2 = dst1; + dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */ + + /* blend */ + src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ + src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ + src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ + dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ + dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */ + + mm_res = dst2; /* RED -> mm_res */ + + /* green -- process the bits in place */ + src2 = src1; + src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */ + + dst2 = dst1; + dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */ + + /* blend */ + src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ + src2 
= _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ + src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ + dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ + + mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */ + + /* blue */ + src2 = src1; /* src -> src2 */ + src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */ + + dst2 = dst1; /* dst -> dst2 */ + dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */ + + /* blend */ + src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ + src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ + src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ + dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ + dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */ + + mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */ + + *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */ + + srcp += 4; + dstp += 4; + }, width); + srcp += srcskip; + dstp += dstskip; + } + _mm_empty(); + } +} +#endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ /* fast RGB565->RGB565 blending with surface alpha */ static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info) @@ -2177,29 +2745,31 @@ if(sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask - && (sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff && sf->BytesPerPixel == 4) { #if MMX_ASMBLIT - if(SDL_HasMMX()) - return BlitRGBtoRGBSurfaceAlphaMMX; + if(sf->Rshift % 8 == 0 + && sf->Gshift % 8 == 0 + && sf->Bshift % 8 == 0 + && SDL_HasMMX()) + return BlitRGBtoRGBSurfaceAlphaMMX; +#endif + if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) + { +#if USE_ALTIVEC_BLITTERS + if(SDL_HasAltiVec()) + return BlitRGBtoRGBSurfaceAlphaAltivec; +#endif + return BlitRGBtoRGBSurfaceAlpha; + } + } +#if SDL_ALTIVEC_BLITTERS + if((sf->BytesPerPixel == 4) && + !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) + return Blit32to32SurfaceAlphaAltivec; else #endif -#if 
SDL_ALTIVEC_BLITTERS - if(!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) - return BlitRGBtoRGBSurfaceAlphaAltivec; - else -#endif - return BlitRGBtoRGBSurfaceAlpha; - } - else -#if SDL_ALTIVEC_BLITTERS - if((sf->BytesPerPixel == 4) && - !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) - return Blit32to32SurfaceAlphaAltivec; - else -#endif - return BlitNtoNSurfaceAlpha; + return BlitNtoNSurfaceAlpha; case 3: default: @@ -2232,34 +2802,40 @@ return BlitNtoNPixelAlpha; case 4: - if(sf->Amask == 0xff000000 - && sf->Rmask == df->Rmask + if(sf->Rmask == df->Rmask && sf->Gmask == df->Gmask && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) { #if MMX_ASMBLIT - if(SDL_Has3DNow()) - return BlitRGBtoRGBPixelAlphaMMX3DNOW; - else - if(SDL_HasMMX()) - return BlitRGBtoRGBPixelAlphaMMX; - else + if(sf->Rshift % 8 == 0 + && sf->Gshift % 8 == 0 + && sf->Bshift % 8 == 0 + && sf->Ashift % 8 == 0 + && sf->Aloss == 0) + { + if(SDL_Has3DNow()) + return BlitRGBtoRGBPixelAlphaMMX3DNOW; + if(SDL_HasMMX()) + return BlitRGBtoRGBPixelAlphaMMX; + } #endif -#if SDL_ALTIVEC_BLITTERS - if(!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) - return BlitRGBtoRGBPixelAlphaAltivec; - else + if(sf->Amask == 0xff000000) + { +#if USE_ALTIVEC_BLITTERS + if(SDL_HasAltiVec()) + return BlitRGBtoRGBPixelAlphaAltivec; #endif - return BlitRGBtoRGBPixelAlpha; + return BlitRGBtoRGBPixelAlpha; + } } #if SDL_ALTIVEC_BLITTERS - if (sf->Amask && sf->BytesPerPixel == 4 && - !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) - return Blit32to32PixelAlphaAltivec; - else + if (sf->Amask && sf->BytesPerPixel == 4 && + !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) + return Blit32to32PixelAlphaAltivec; + else #endif - return BlitNtoNPixelAlpha; + return BlitNtoNPixelAlpha; case 3: default: