comparison src/video/SDL_blit_A.c @ 2255:17b2369756be

Use MMX intrinsics over GCC inline assembly
author Sam Lantinga <slouken@libsdl.org>
date Thu, 16 Aug 2007 22:18:53 +0000
parents 6630fefab312
children 340942cfda48
comparison
equal deleted inserted replaced
2254:79e00f5561f4 2255:17b2369756be
21 */ 21 */
22 #include "SDL_config.h" 22 #include "SDL_config.h"
23 23
24 #include "SDL_video.h" 24 #include "SDL_video.h"
25 #include "SDL_blit.h" 25 #include "SDL_blit.h"
26
27 /*
28 In Visual C, VC6 has mmintrin.h in the "Processor Pack" add-on.
29 Checking if _mm_free is #defined in malloc.h is the only way to
30 determine if the Processor Pack is installed, as far as I can tell.
31 */
32
33 #if SDL_ASSEMBLY_ROUTINES
34 # if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
35 # define MMX_ASMBLIT 1
36 # define GCC_ASMBLIT 1
37 # elif defined(_MSC_VER) && defined(_M_IX86)
38 # if (_MSC_VER <= 1200)
39 # include <malloc.h>
40 # if defined(_mm_free)
41 # define HAVE_MMINTRIN_H 1
42 # endif
43 # else /* Visual Studio > VC6 always has mmintrin.h */
44 # define HAVE_MMINTRIN_H 1
45 # endif
46 # if HAVE_MMINTRIN_H
47 # define MMX_ASMBLIT 1
48 # define MSVC_ASMBLIT 1
49 # endif
50 # endif
51 #endif /* SDL_ASSEMBLY_ROUTINES */
52
53 /* Function to check the CPU flags */
54 #include "SDL_cpuinfo.h"
55 #if GCC_ASMBLIT
56 #include "mmx.h"
57 #elif MSVC_ASMBLIT
58 #include <mmintrin.h>
59 #include <mm3dnow.h>
60 #endif
61 26
62 /* Functions to perform alpha blended blitting */ 27 /* Functions to perform alpha blended blitting */
63 28
64 /* N->1 blending with per-surface alpha */ 29 /* N->1 blending with per-surface alpha */
65 static void 30 static void
230 src += srcskip; 195 src += srcskip;
231 dst += dstskip; 196 dst += dstskip;
232 } 197 }
233 } 198 }
234 199
235 #if GCC_ASMBLIT 200 #ifdef __MMX__
236 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ 201
237 static void
238 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info) /* 50% blend: per-channel average of src and dst, then OR in the dst alpha mask */
239 {
240 int width = info->d_width;
241 int height = info->d_height;
242 Uint32 *srcp = (Uint32 *) info->s_pixels;
243 int srcskip = info->s_skip >> 2;
244 Uint32 *dstp = (Uint32 *) info->d_pixels;
245 int dstskip = info->d_skip >> 2;
246 Uint32 dalpha = info->dst->Amask; /* destination alpha mask, OR-ed into every written pixel */
247 Uint8 load[8]; /* 8-byte staging buffer for the 64-bit MMX constants below */
248
249 *(Uint64 *) load = 0x00fefefe00fefefeULL; /* alpha128 mask: colour bytes minus their low bit, so per-byte sums can carry without touching the next channel */
250 movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
251 *(Uint64 *) load = 0x0001010100010101ULL; /* !alpha128 mask: the low bit of each colour byte, re-added where src and dst both have it set */
252 movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
253 movd_m2r(dalpha, mm7); /* dst alpha mask */
254 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
255 while (height--) {
256 /* *INDENT-OFF* */
257 DUFFS_LOOP_DOUBLE2( /* first body: one pixel, scalar; second body: two pixels, MMX */
258 {
259 Uint32 s = *srcp++;
260 Uint32 d = *dstp;
261 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
262 + (s & d & 0x00010101)) | dalpha;
263 },{
264 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
265 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
266
267 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
268 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
269
270 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
271 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
272 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
273 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
274 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
275 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
276 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
277
278 por_r2r(mm7, mm2); /* mm7(dst alpha mask) | mm2 -> mm2 */
279 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
280 dstp += 2;
281 srcp += 2;
282 }, width);
283 /* *INDENT-ON* */
284 srcp += srcskip;
285 dstp += dstskip;
286 }
287 emms();
288 }
289
290 /* fast RGB888->(A)RGB888 blending with surface alpha; hands off to the alpha=128 variant when alpha is 128 and R,G,B fill the low 24 bits */
291 static void
292 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
293 {
294 SDL_PixelFormat *df = info->dst;
295 unsigned alpha = info->src->alpha;
296
297 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
298 /* only call a128 version when R,G,B occupy lower bits */
299 BlitRGBtoRGBSurfaceAlpha128MMX(info);
300 } else {
301 int width = info->d_width;
302 int height = info->d_height;
303 Uint32 *srcp = (Uint32 *) info->s_pixels;
304 int srcskip = info->s_skip >> 2;
305 Uint32 *dstp = (Uint32 *) info->d_pixels;
306 int dstskip = info->d_skip >> 2;
307
308 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
309 /* form the alpha mult */
310 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
311 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
312 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
313 alpha = /* 'alpha' is reused here to hold the combined R|G|B byte mask */
314 (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->
315 Bshift);
316 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
317 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
318 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
319 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
320 movd_m2r(df->Amask, mm7); /* dst alpha mask */
321 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
322
323 while (height--) {
324 /* *INDENT-OFF* */
325 DUFFS_LOOP_DOUBLE2({
326 /* One Pixel Blend */
327 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
328 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
329 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
330 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
331
332 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
333 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
334 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
335 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
336
337 packuswb_r2r(mm5, mm2); /* 0000ARGB -> mm2 (upper half comes from the zeroed mm5) */
338 por_r2r(mm7, mm2); /* mm7(dst alpha mask) | mm2 -> mm2 */
339 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
340 ++srcp;
341 ++dstp;
342 },{
343 /* Two Pixels Blend */
344 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
345 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
346 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
347 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
348
349 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
350 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
351 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
352 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
353
354 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
355 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
356 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */
357 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
358
359 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
360 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
361 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
362 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
363
364 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
365 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
366
367 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
368
369 srcp += 2;
370 dstp += 2;
371 }, width);
372 /* *INDENT-ON* */
373 srcp += srcskip;
374 dstp += dstskip;
375 }
376 emms();
377 }
378 }
379
380 /* fast ARGB888->(A)RGB888 blending with pixel alpha: alpha 0 leaves dst untouched, full alpha copies RGB and keeps dst alpha, anything else is blended */
381 static void
382 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
383 {
384 int width = info->d_width;
385 int height = info->d_height;
386 Uint32 *srcp = (Uint32 *) info->s_pixels;
387 int srcskip = info->s_skip >> 2;
388 Uint32 *dstp = (Uint32 *) info->d_pixels;
389 int dstskip = info->d_skip >> 2;
390 SDL_PixelFormat *sf = info->src;
391 Uint32 amask = sf->Amask; /* source alpha mask; also the value a fully opaque pixel's masked alpha compares equal to */
392
393 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
394 /* form multiplication mask */
395 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
396 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
397 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
398 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
399 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
400 /* form channel masks */
401 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
402 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
403 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
404 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
405 /* get alpha channel shift */
406 /* *INDENT-OFF* */
407 __asm__ __volatile__ (
408 "movd %0, %%mm5"
409 : : "rm" ((Uint32) sf->Ashift) ); /* Ashift -> mm5 */
410 /* *INDENT-ON* */
411
412 while (height--) {
413 /* *INDENT-OFF* */
414 DUFFS_LOOP4({
415 Uint32 alpha = *srcp & amask; /* source alpha, still in its in-pixel bit position */
416 /* FIXME: Here we special-case opaque alpha since the
417 compositioning used (>>8 instead of /255) doesn't handle
418 it correctly. Also special-case alpha=0 for speed?
419 Benchmark this! */
420 if(alpha == 0) {
421 /* do nothing */
422 } else if(alpha == amask) {
423 /* opaque alpha -- copy RGB, keep dst alpha */
424 /* using MMX here to free up regular registers for other things */
425 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
426 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
427 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
428 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
429 por_r2r(mm1, mm2); /* src | dst -> mm2 */
430 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
431 } else {
432 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
433 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
434
435 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
436 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
437
438 __asm__ __volatile__ (
439 "movd %0, %%mm4"
440 : : "r" (alpha) ); /* 0000A000 -> mm4 */
441 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
442 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
443 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
444 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
445
446 /* blend */
447 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
448 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
449 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
450 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
451
452 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
453 movd_r2m(mm2, *dstp);/* mm2 -> dst */
454 }
455 ++srcp;
456 ++dstp;
457 }, width);
458 /* *INDENT-ON* */
459 srcp += srcskip;
460 dstp += dstskip;
461 }
462 emms();
463 }
464
465 /* End GCC_ASMBLIT */
466
467 #elif MSVC_ASMBLIT
468 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ 202 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
469 static void 203 static void
470 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info) 204 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
471 { 205 {
472 int width = info->d_width; 206 int width = info->d_width;
635 Uint64 multmask; 369 Uint64 multmask;
636 370
637 __m64 src1, dst1, mm_alpha, mm_zero, dmask; 371 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
638 372
639 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ 373 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
640 /* *INDENT-OFF* */ 374 multmask = 0xFFFF;
641 multmask = ~(0xFFFFI64 << (ashift * 2)); 375 multmask <<= (ashift * 2);
642 /* *INDENT-ON* */ 376 multmask = ~multmask;
643 dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */ 377 dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
644 378
645 while (height--) { 379 while (height--) {
646 /* *INDENT-OFF* */ 380 /* *INDENT-OFF* */
647 DUFFS_LOOP4({ 381 DUFFS_LOOP4({
681 dstp += dstskip; 415 dstp += dstskip;
682 } 416 }
683 _mm_empty(); 417 _mm_empty();
684 } 418 }
685 419
686 /* End MSVC_ASMBLIT */ 420 #endif /* __MMX__ */
687
688 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
689 421
690 #if SDL_ALTIVEC_BLITTERS 422 #if SDL_ALTIVEC_BLITTERS
691 #if __MWERKS__ 423 #if __MWERKS__
692 #pragma altivec_model on 424 #pragma altivec_model on
693 #endif 425 #endif
1637 srcp += srcskip; 1369 srcp += srcskip;
1638 dstp += dstskip; 1370 dstp += dstskip;
1639 } 1371 }
1640 } 1372 }
1641 1373
1642 #if GCC_ASMBLIT 1374 #ifdef __MMX__
1643 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha: alpha 0 leaves dst untouched, full alpha copies RGB and keeps dst alpha, anything else is blended */
1644 static void
1645 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
1646 {
1647 int width = info->d_width;
1648 int height = info->d_height;
1649 Uint32 *srcp = (Uint32 *) info->s_pixels;
1650 int srcskip = info->s_skip >> 2;
1651 Uint32 *dstp = (Uint32 *) info->d_pixels;
1652 int dstskip = info->d_skip >> 2;
1653 SDL_PixelFormat *sf = info->src;
1654 Uint32 amask = sf->Amask;
1655
1656 __asm__(
1657 /* make mm6 all zeros. */
1658 "pxor %%mm6, %%mm6\n"
1659 /* Make a mask to preserve the alpha. */
1660 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1661 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1662 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1663 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1664 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1665 /* form channel masks */
1666 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1667 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1668 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1669 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1670 /* get alpha channel shift */
1671 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1672 : /* nothing */ : "rm"(amask), "rm"((Uint32) sf->Ashift));
1673
1674 while (height--) {
1675
1676 /* *INDENT-OFF* */
1677 DUFFS_LOOP4({
1678 Uint32 alpha;
1679
1680 __asm__ (
1681 "prefetch 64(%0)\n"
1682 "prefetch 64(%1)\n"
1683 : : "r" (srcp), "r" (dstp) );
1684
1685 alpha = *srcp & amask;
1686 /* FIXME: Here we special-case opaque alpha since the
1687 compositioning used (>>8 instead of /255) doesn't handle
1688 it correctly. Also special-case alpha=0 for speed?
1689 Benchmark this! */
1690 if(alpha == 0) {
1691 /* do nothing */
1692 }
1693 else if(alpha == amask) {
1694 /* opaque alpha -- copy RGB, keep dst alpha */
1695 /* using MMX here to free up regular registers for other things */
1696 __asm__ (
1697 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1698 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1699 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1700 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm1 */
1701 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1702 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1703
1704 : : "r" (srcp), "r" (dstp) );
1705 }
1706
1707 else {
1708 __asm__ (
1709 /* load in the source, and dst. */
1710 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */
1711 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */
1712
1713 /* Move the src alpha into mm2 */
1714
1715 /* if supporting pshufw */
1716 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1717 /*"psrlw $8, %%mm2\n" */
1718
1719 /* else: */
1720 "movd %2, %%mm2\n" /* operand 2 is the masked src alpha, still at its in-pixel position */
1721 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1722 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1723 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1724 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1725
1726 /* move the colors into words. */
1727 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1728 "punpcklbw %%mm6, %%mm1\n" /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1729
1730 /* src - dst */
1731 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1732
1733 /* A * (src-dst) */
1734 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1735 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1736 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1737
1738 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1739
1740 "movd %%mm0, (%1)\n" /* store the result held in mm0 -> dst */
1741
1742 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1743
1744 }
1745 ++srcp;
1746 ++dstp;
1747 }, width);
1748 /* *INDENT-ON* */
1749 srcp += srcskip;
1750 dstp += dstskip;
1751 }
1752
1753 __asm__("emms\n":);
1754 }
1755
1756 /* End GCC_ASMBLIT*/
1757
1758 #elif MSVC_ASMBLIT
1759 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ 1375 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1760 static void 1376 static void
1761 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info) 1377 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
1762 { 1378 {
1763 int width = info->d_width; 1379 int width = info->d_width;
1773 Uint64 multmask; 1389 Uint64 multmask;
1774 1390
1775 __m64 src1, dst1, mm_alpha, mm_zero, dmask; 1391 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1776 1392
1777 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ 1393 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1778 /* *INDENT-OFF* */ 1394 multmask = 0xFFFF;
1779 multmask = ~(0xFFFFI64 << (ashift * 2)); 1395 multmask <<= (ashift * 2);
1780 /* *INDENT-ON* */ 1396 multmask = ~multmask;
1781 dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */ 1397 dmask = *(__m64 *) & multmask; /* dst alpha mask -> dmask */
1782 1398
1783 while (height--) { 1399 while (height--) {
1784 /* *INDENT-OFF* */ 1400 /* *INDENT-OFF* */
1785 DUFFS_LOOP4({ 1401 DUFFS_LOOP4({
1824 dstp += dstskip; 1440 dstp += dstskip;
1825 } 1441 }
1826 _mm_empty(); 1442 _mm_empty();
1827 } 1443 }
1828 1444
1829 /* End MSVC_ASMBLIT */ 1445 #endif /* __MMX__ */
1830
1831 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1832 1446
1833 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ 1447 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1834 1448
1835 /* blend a single 16 bit pixel at 50% */ 1449 /* blend a single 16 bit pixel at 50% */
1836 #define BLEND16_50(d, s, mask) \ 1450 #define BLEND16_50(d, s, mask) \
1938 dstp += dstskip; 1552 dstp += dstskip;
1939 } 1553 }
1940 } 1554 }
1941 } 1555 }
1942 1556
1943 #if GCC_ASMBLIT 1557 #ifdef __MMX__
1944 /* fast RGB565->RGB565 blending with surface alpha */ 1558
1945 static void
1946 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
1947 {
1948 unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down for the scalar bodies */
1949 if (alpha == 128) {
1950 Blit16to16SurfaceAlpha128(info, 0xf7de); /* alpha 128 takes the dedicated helper instead of the MMX loop */
1951 } else {
1952 int width = info->d_width;
1953 int height = info->d_height;
1954 Uint16 *srcp = (Uint16 *) info->s_pixels;
1955 int srcskip = info->s_skip >> 1;
1956 Uint16 *dstp = (Uint16 *) info->d_pixels;
1957 int dstskip = info->d_skip >> 1;
1958 Uint32 s, d;
1959 Uint8 load[8]; /* 8-byte staging buffer for the 64-bit MMX constants below */
1960
1961 alpha &= ~(1 + 2 + 4); /* clear the low 3 bits so the MMX body uses the same alpha as the scalar bodies, which use alpha >> 3 */
1962 *(Uint64 *) load = alpha;
1963 alpha >>= 3; /* downscale alpha to 5 bits */
1964
1965 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
1966 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1967 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1968 /* position alpha to allow for mullo and mulhi on diff channels
1969 to reduce the number of operations */
1970 psllq_i2r(3, mm0);
1971
1972 /* Setup the 565 color channel masks */
1973 *(Uint64 *) load = 0x07E007E007E007E0ULL;
1974 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
1975 *(Uint64 *) load = 0x001F001F001F001FULL;
1976 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
1977 while (height--) {
1978 /* *INDENT-OFF* */
1979 DUFFS_LOOP_QUATRO2( /* bodies in order: one pixel, two pixels (both scalar), four pixels (MMX) */
1980 {
1981 s = *srcp++;
1982 d = *dstp;
1983 /*
1984 * shift out the middle component (green) to
1985 * the high 16 bits, and process all three RGB
1986 * components at the same time.
1987 */
1988 s = (s | s << 16) & 0x07e0f81f;
1989 d = (d | d << 16) & 0x07e0f81f;
1990 d += (s - d) * alpha >> 5;
1991 d &= 0x07e0f81f;
1992 *dstp++ = d | d >> 16;
1993 },{
1994 s = *srcp++;
1995 d = *dstp;
1996 /*
1997 * shift out the middle component (green) to
1998 * the high 16 bits, and process all three RGB
1999 * components at the same time.
2000 */
2001 s = (s | s << 16) & 0x07e0f81f;
2002 d = (d | d << 16) & 0x07e0f81f;
2003 d += (s - d) * alpha >> 5;
2004 d &= 0x07e0f81f;
2005 *dstp++ = d | d >> 16;
2006 s = *srcp++;
2007 d = *dstp;
2008 /*
2009 * shift out the middle component (green) to
2010 * the high 16 bits, and process all three RGB
2011 * components at the same time.
2012 */
2013 s = (s | s << 16) & 0x07e0f81f;
2014 d = (d | d << 16) & 0x07e0f81f;
2015 d += (s - d) * alpha >> 5;
2016 d &= 0x07e0f81f;
2017 *dstp++ = d | d >> 16;
2018 },{
2019 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2020 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2021
2022 /* red -- does not need a mask since the right shift clears
2023 the uninteresting bits */
2024 movq_r2r(mm2, mm5); /* src -> mm5 */
2025 movq_r2r(mm3, mm6); /* dst -> mm6 */
2026 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
2027 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
2028
2029 /* blend */
2030 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2031 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2032 /* alpha used is actually 11 bits
2033 11 + 5 = 16 bits, so the sign bits are lost */
2034 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2035 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2036 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
2037
2038 movq_r2r(mm6, mm1); /* save new reds in dsts */
2039
2040 /* green -- process the bits in place */
2041 movq_r2r(mm2, mm5); /* src -> mm5 */
2042 movq_r2r(mm3, mm6); /* dst -> mm6 */
2043 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2044 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2045
2046 /* blend */
2047 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2048 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2049 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
2050 bits are gone and the sign bits present */
2051 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2052 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2053
2054 por_r2r(mm6, mm1); /* save new greens in dsts */
2055
2056 /* blue */
2057 movq_r2r(mm2, mm5); /* src -> mm5 */
2058 movq_r2r(mm3, mm6); /* dst -> mm6 */
2059 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2060 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2061
2062 /* blend */
2063 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2064 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2065 /* 11 + 5 = 16 bits, so the sign bits are lost and
2066 the interesting bits will need to be MASKed */
2067 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2068 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2069 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2070
2071 por_r2r(mm6, mm1); /* save new blues in dsts */
2072
2073 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
2074
2075 srcp += 4;
2076 dstp += 4;
2077 }, width);
2078 /* *INDENT-ON* */
2079 srcp += srcskip;
2080 dstp += dstskip;
2081 }
2082 emms();
2083 }
2084 }
2085
2086 /* fast RGB555->RGB555 blending with surface alpha; alpha 128 is handed to Blit16to16SurfaceAlpha128, everything else runs the loop below */
2087 static void
2088 Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
2089 {
2090 unsigned alpha = info->src->alpha; /* per-surface alpha; reduced to 5 bits further down for the scalar bodies */
2091 if (alpha == 128) {
2092 Blit16to16SurfaceAlpha128(info, 0xfbde);
2093 } else {
2094 int width = info->d_width;
2095 int height = info->d_height;
2096 Uint16 *srcp = (Uint16 *) info->s_pixels;
2097 int srcskip = info->s_skip >> 1;
2098 Uint16 *dstp = (Uint16 *) info->d_pixels;
2099 int dstskip = info->d_skip >> 1;
2100 Uint32 s, d;
2101 Uint8 load[8]; /* 8-byte staging buffer for the 64-bit MMX constants below */
2102
2103 alpha &= ~(1 + 2 + 4); /* clear the low 3 bits so the MMX body uses the same alpha as the scalar bodies, which use alpha >> 3 */
2104 *(Uint64 *) load = alpha;
2105 alpha >>= 3; /* downscale alpha to 5 bits */
2106
2107 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
2108 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
2109 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
2110 /* position alpha to allow for mullo and mulhi on diff channels
2111 to reduce the number of operations */
2112 psllq_i2r(3, mm0);
2113
2114 /* Setup the 555 color channel masks */
2115 *(Uint64 *) load = 0x03E003E003E003E0ULL;
2116 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
2117 *(Uint64 *) load = 0x001F001F001F001FULL;
2118 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
2119 while (height--) {
2120 /* *INDENT-OFF* */
2121 DUFFS_LOOP_QUATRO2( /* bodies in order: one pixel, two pixels (both scalar), four pixels (MMX) */
2122 {
2123 s = *srcp++;
2124 d = *dstp;
2125 /*
2126 * shift out the middle component (green) to
2127 * the high 16 bits, and process all three RGB
2128 * components at the same time.
2129 */
2130 s = (s | s << 16) & 0x03e07c1f;
2131 d = (d | d << 16) & 0x03e07c1f;
2132 d += (s - d) * alpha >> 5;
2133 d &= 0x03e07c1f;
2134 *dstp++ = d | d >> 16;
2135 },{
2136 s = *srcp++;
2137 d = *dstp;
2138 /*
2139 * shift out the middle component (green) to
2140 * the high 16 bits, and process all three RGB
2141 * components at the same time.
2142 */
2143 s = (s | s << 16) & 0x03e07c1f;
2144 d = (d | d << 16) & 0x03e07c1f;
2145 d += (s - d) * alpha >> 5;
2146 d &= 0x03e07c1f;
2147 *dstp++ = d | d >> 16;
2148 s = *srcp++;
2149 d = *dstp;
2150 /*
2151 * shift out the middle component (green) to
2152 * the high 16 bits, and process all three RGB
2153 * components at the same time.
2154 */
2155 s = (s | s << 16) & 0x03e07c1f;
2156 d = (d | d << 16) & 0x03e07c1f;
2157 d += (s - d) * alpha >> 5;
2158 d &= 0x03e07c1f;
2159 *dstp++ = d | d >> 16;
2160 },{
2161 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
2162 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
2163
2164 /* red -- process the bits in place */
2165 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
2166 /* by reusing the GREEN mask we free up another mmx
2167 register to accumulate the result */
2168
2169 movq_r2r(mm2, mm5); /* src -> mm5 */
2170 movq_r2r(mm3, mm6); /* dst -> mm6 */
2171 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
2172 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
2173
2174 /* blend */
2175 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2176 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2177 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
2178 cleared by a MASK below */
2179 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2180 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2181 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
2182
2183 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
2184
2185 movq_r2r(mm6, mm1); /* save new reds in dsts */
2186
2187 /* green -- process the bits in place */
2188 movq_r2r(mm2, mm5); /* src -> mm5 */
2189 movq_r2r(mm3, mm6); /* dst -> mm6 */
2190 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
2191 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
2192
2193 /* blend */
2194 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2195 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2196 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
2197 bits are gone and the sign bits present */
2198 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
2199 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2200
2201 por_r2r(mm6, mm1); /* save new greens in dsts */
2202
2203 /* blue */
2204 movq_r2r(mm2, mm5); /* src -> mm5 */
2205 movq_r2r(mm3, mm6); /* dst -> mm6 */
2206 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
2207 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
2208
2209 /* blend */
2210 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
2211 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
2212 /* 11 + 5 = 16 bits, so the sign bits are lost and
2213 the interesting bits will need to be MASKed */
2214 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
2215 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
2216 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
2217
2218 por_r2r(mm6, mm1); /* save new blues in dsts */
2219
2220 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
2221
2222 srcp += 4;
2223 dstp += 4;
2224 }, width);
2225 /* *INDENT-ON* */
2226 srcp += srcskip;
2227 dstp += dstskip;
2228 }
2229 emms();
2230 }
2231 }
2232
2233 /* End GCC_ASMBLIT */
2234
2235 #elif MSVC_ASMBLIT
2236 /* fast RGB565->RGB565 blending with surface alpha */ 1559 /* fast RGB565->RGB565 blending with surface alpha */
2237 static void 1560 static void
2238 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info) 1561 Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
2239 { 1562 {
2240 unsigned alpha = info->src->alpha; 1563 unsigned alpha = info->src->alpha;
2505 dstp += dstskip; 1828 dstp += dstskip;
2506 } 1829 }
2507 _mm_empty(); 1830 _mm_empty();
2508 } 1831 }
2509 } 1832 }
2510 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ 1833
1834 #endif /* __MMX__ */
2511 1835
2512 /* fast RGB565->RGB565 blending with surface alpha */ 1836 /* fast RGB565->RGB565 blending with surface alpha */
2513 static void 1837 static void
2514 Blit565to565SurfaceAlpha(SDL_BlitInfo * info) 1838 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
2515 { 1839 {
2850 return BlitNto1SurfaceAlpha; 2174 return BlitNto1SurfaceAlpha;
2851 2175
2852 case 2: 2176 case 2:
2853 if (surface->map->identity) { 2177 if (surface->map->identity) {
2854 if (df->Gmask == 0x7e0) { 2178 if (df->Gmask == 0x7e0) {
2855 #if MMX_ASMBLIT 2179 #ifdef __MMX__
2856 if (SDL_HasMMX()) 2180 if (SDL_HasMMX())
2857 return Blit565to565SurfaceAlphaMMX; 2181 return Blit565to565SurfaceAlphaMMX;
2858 else 2182 else
2859 #endif 2183 #endif
2860 return Blit565to565SurfaceAlpha; 2184 return Blit565to565SurfaceAlpha;
2861 } else if (df->Gmask == 0x3e0) { 2185 } else if (df->Gmask == 0x3e0) {
2862 #if MMX_ASMBLIT 2186 #ifdef __MMX__
2863 if (SDL_HasMMX()) 2187 if (SDL_HasMMX())
2864 return Blit555to555SurfaceAlphaMMX; 2188 return Blit555to555SurfaceAlphaMMX;
2865 else 2189 else
2866 #endif 2190 #endif
2867 return Blit555to555SurfaceAlpha; 2191 return Blit555to555SurfaceAlpha;
2871 2195
2872 case 4: 2196 case 4:
2873 if (sf->Rmask == df->Rmask 2197 if (sf->Rmask == df->Rmask
2874 && sf->Gmask == df->Gmask 2198 && sf->Gmask == df->Gmask
2875 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) { 2199 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
2876 #if MMX_ASMBLIT 2200 #ifdef __MMX__
2877 if (sf->Rshift % 8 == 0 2201 if (sf->Rshift % 8 == 0
2878 && sf->Gshift % 8 == 0 2202 && sf->Gshift % 8 == 0
2879 && sf->Bshift % 8 == 0 && SDL_HasMMX()) 2203 && sf->Bshift % 8 == 0 && SDL_HasMMX())
2880 return BlitRGBtoRGBSurfaceAlphaMMX; 2204 return BlitRGBtoRGBSurfaceAlphaMMX;
2881 #endif 2205 #endif
2926 2250
2927 case 4: 2251 case 4:
2928 if (sf->Rmask == df->Rmask 2252 if (sf->Rmask == df->Rmask
2929 && sf->Gmask == df->Gmask 2253 && sf->Gmask == df->Gmask
2930 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) { 2254 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
2931 #if MMX_ASMBLIT 2255 #ifdef __MMX__
2932 if (sf->Rshift % 8 == 0 2256 if (sf->Rshift % 8 == 0
2933 && sf->Gshift % 8 == 0 2257 && sf->Gshift % 8 == 0
2934 && sf->Bshift % 8 == 0 2258 && sf->Bshift % 8 == 0
2935 && sf->Ashift % 8 == 0 && sf->Aloss == 0) { 2259 && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
2936 if (SDL_Has3DNow()) 2260 if (SDL_Has3DNow())