Mercurial > sdl-ios-xcode
comparison src/video/SDL_blit_A.c @ 689:5bb080d35049
Date: Tue, 19 Aug 2003 17:57:00 +0200
From: Stephane Marchesin
Subject: Re: [SDL] [patch] MMX alpha blit patches with MMX detection
I think everything is correct now. I've done as much testing as I could,
but some real-world testing wouldn't hurt, I think.
The patch is here : http://icps.u-strasbg.fr/~marchesin/sdl_mmxblit.patch
If you do byte-by-byte comparison of the output between C and MMX
functions, you'll notice that the results for 555 and 565 RGB alpha
blits aren't exactly the same. This is because the MMX functions for 555 and
565 RGB have a higher accuracy. If you want the exact same behaviour
that's possible by masking the three lower alpha bits in the MMX
functions. Just ask !
I removed one MMX function because after I fixed it to match its C
equivalent, it turned out to be slower than the C version on a PIII
(although a bit faster on an Athlon XP).
I've also added MMX and PIII replacements for SDL_memcpy. Those provide
some speed up in testvidinfo -benchmark (at least for me, under linux &
X11).
author | Sam Lantinga <slouken@libsdl.org> |
---|---|
date | Fri, 22 Aug 2003 05:51:19 +0000 |
parents | f6ffac90895c |
children | f90d80d68071 |
comparison
equal
deleted
inserted
replaced
688:c0522010bb6d | 689:5bb080d35049 |
---|---|
28 #include <stdio.h> | 28 #include <stdio.h> |
29 | 29 |
30 #include "SDL_types.h" | 30 #include "SDL_types.h" |
31 #include "SDL_video.h" | 31 #include "SDL_video.h" |
32 #include "SDL_blit.h" | 32 #include "SDL_blit.h" |
33 | |
34 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT) | |
35 #include "mmx.h" | |
36 /* Function to check the CPU flags */ | |
37 #define MMX_CPU 0x800000 | |
38 #define TDNOW_CPU 0x80000000 | |
39 #define CPU_Flags() Hermes_X86_CPU() | |
40 #define X86_ASSEMBLER | |
41 #define HermesConverterInterface void | |
42 #define HermesClearInterface void | |
43 #define STACKCALL | |
44 #include "HeadX86.h" | |
45 #endif | |
33 | 46 |
34 /* Functions to perform alpha blended blitting */ | 47 /* Functions to perform alpha blended blitting */ |
35 | 48 |
36 /* N->1 blending with per-surface alpha */ | 49 /* N->1 blending with per-surface alpha */ |
37 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info) | 50 static void BlitNto1SurfaceAlpha(SDL_BlitInfo *info) |
193 src += srcskip; | 206 src += srcskip; |
194 dst += dstskip; | 207 dst += dstskip; |
195 } | 208 } |
196 } | 209 } |
197 | 210 |
211 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT) | |
212 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case: each colour channel becomes (s + d) / 2 and the alpha byte of the result is forced to 0xff */ | |
213 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) | |
214 { | |
215 int width = info->d_width; | |
216 int height = info->d_height; | |
217 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
218 int srcskip = info->s_skip >> 2; | |
219 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
220 int dstskip = info->d_skip >> 2; | |
221 Uint8 load[8]; /* 8-byte scratch buffer used to stage 64-bit constants for movq */ | |
222 | |
223 *(Uint64 *)load = 0x00fefefe00fefefe;/* alpha128 mask */ | |
224 movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */ | |
225 *(Uint64 *)load = 0x0001010100010101;/* !alpha128 mask */ | |
226 movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */ | |
227 *(Uint64 *)load = 0xFF000000FF000000;/* dst alpha mask */ | |
228 movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */ | |
229 while(height--) { | |
230 DUFFS_LOOP_DOUBLE2( | |
231 { /* blend a single pixel in plain C */ | |
232 Uint32 s = *srcp++; | |
233 Uint32 d = *dstp; | |
234 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) | |
235 + (s & d & 0x00010101)) | 0xff000000; | |
236 },{ /* blend two pixels at once with MMX (same formula as the C body) */ | |
237 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ | |
238 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ | |
239 | |
240 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */ | |
241 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */ | |
242 | |
243 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */ | |
244 pand_r2r(mm4, mm5); /* src & mask -> mm5 */ | |
245 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */ | |
246 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */ | |
247 | |
248 pand_r2r(mm1, mm2); /* src & dst -> mm2 */ | |
249 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 (low bit shared by src and dst) */ | |
250 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */ | |
251 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ | |
252 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */ | |
253 dstp += 2; | |
254 srcp += 2; | |
255 }, width); | |
256 srcp += srcskip; | |
257 dstp += dstskip; | |
258 } | |
259 emms(); | |
260 } | |
261 | |
262 /* fast RGB888->(A)RGB888 blending with surface alpha; alpha == 128 is handed to BlitRGBtoRGBSurfaceAlpha128MMX, and the alpha byte of every written pixel is forced to 0xff */ | |
263 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) | |
264 { | |
265 unsigned alpha = info->src->alpha; | |
266 if(alpha == 128) { | |
267 BlitRGBtoRGBSurfaceAlpha128MMX(info); | |
268 } else { | |
269 int width = info->d_width; | |
270 int height = info->d_height; | |
271 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
272 int srcskip = info->s_skip >> 2; | |
273 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
274 int dstskip = info->d_skip >> 2; | |
275 Uint8 load[8] = {alpha, alpha, alpha, alpha, | |
276 alpha, alpha, alpha, alpha}; /* alpha replicated into all 8 bytes */ | |
277 | |
278 movq_m2r(*load, mm4); /* alpha -> mm4 */ | |
279 *(Uint64 *)load = 0x00FF00FF00FF00FF; | |
280 movq_m2r(*load, mm3); /* mask -> mm3 */ | |
281 pand_r2r(mm3, mm4); /* mm4 & mask -> 0A0A0A0A -> mm4 */ | |
282 *(Uint64 *)load = 0xFF000000FF000000;/* dst alpha mask */ | |
283 movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */ | |
284 | |
285 while(height--) { | |
286 DUFFS_LOOP_DOUBLE2({ | |
287 /* One Pixel Blend */ | |
288 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ | |
289 punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */ | |
290 pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */ | |
291 | |
292 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ | |
293 movq_r2r(mm2, mm6);/* dst(ARGB) -> mm6 (0000ARGB); mm6 is not read again in this one-pixel path */ | |
294 punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */ | |
295 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ | |
296 | |
297 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ | |
298 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ | |
299 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ | |
300 paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ | |
301 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ | |
302 packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */ | |
303 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ | |
304 movd_r2m(mm2, *dstp);/* mm2 -> pixel */ | |
305 ++srcp; | |
306 ++dstp; | |
307 },{ | |
308 /* Two Pixels Blend */ | |
309 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/ | |
310 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */ | |
311 punpcklbw_r2r(mm0, mm0); /* low - AARRGGBB -> mm0 */ | |
312 pand_r2r(mm3, mm0); /* 0A0R0G0B -> mm0(src1) */ | |
313 punpckhbw_r2r(mm1, mm1); /* high - AARRGGBB -> mm1 */ | |
314 pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1(src2) */ | |
315 | |
316 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ | |
317 movq_r2r(mm2, mm5); /* 2 x dst -> mm5(ARGBARGB); mm5 is not read again in this path */ | |
318 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ | |
319 punpcklbw_r2r(mm2, mm2); /* low - AARRGGBB -> mm2 */ | |
320 punpckhbw_r2r(mm6, mm6); /* high - AARRGGBB -> mm6 */ | |
321 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2(dst1) */ | |
322 | |
323 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */ | |
324 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */ | |
325 pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6(dst2) */ | |
326 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm0 */ | |
327 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */ | |
328 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ | |
329 paddw_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */ | |
330 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ | |
331 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ | |
332 paddw_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */ | |
333 pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6 */ | |
334 packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */ | |
335 packuswb_r2r(mm6, mm6); /* ARGBARGB -> mm6 */ | |
336 psrlq_i2r(32, mm2); /* mm2 >> 32 -> mm2 (pixel 1 in the low dword) */ | |
337 psllq_i2r(32, mm6); /* mm6 << 32 -> mm6 (pixel 2 in the high dword) */ | |
338 por_r2r(mm6, mm2); /* mm6 | mm2 -> mm2 */ | |
339 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ | |
340 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */ | |
341 srcp += 2; | |
342 dstp += 2; | |
343 }, width); | |
344 srcp += srcskip; | |
345 dstp += dstskip; | |
346 } | |
347 emms(); | |
348 } | |
349 } | |
350 | |
351 /* fast ARGB888->(A)RGB888 blending with pixel alpha: pixels with source alpha 0 are skipped, fully opaque pixels copy the source RGB, everything else is blended; the destination alpha byte is kept in all cases */ | |
352 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info) | |
353 { | |
354 int width = info->d_width; | |
355 int height = info->d_height; | |
356 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
357 int srcskip = info->s_skip >> 2; | |
358 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
359 int dstskip = info->d_skip >> 2; | |
360 Uint32 alpha = 0; | |
361 Uint8 load[8]; /* 8-byte scratch buffer used to stage 64-bit constants for movq */ | |
362 | |
363 *(Uint64 *)load = 0x00FF00FF00FF00FF; | |
364 movq_m2r(*load, mm3); /* mask -> mm3 */ | |
365 *(Uint64 *)load = 0x00FF000000000000; | |
366 movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */ | |
367 *(Uint64 *)load = 0x00FFFFFF00FFFFFF; | |
368 movq_m2r(*load, mm0); /* alpha 255 mask (keeps RGB, clears the alpha byte) -> mm0 */ | |
369 *(Uint64 *)load = 0xFF000000FF000000; | |
370 movq_m2r(*load, mm6); /* alpha 255 !mask (keeps only the alpha byte) -> mm6 */ | |
371 while(height--) { | |
372 DUFFS_LOOP4({ | |
373 alpha = *srcp; | |
374 alpha >>= 24; | |
375 /* FIXME: Here we special-case opaque alpha since the | |
376 compositioning used (>>8 instead of /255) doesn't handle | |
377 it correctly. Also special-case alpha=0 for speed? | |
378 Benchmark this! */ | |
379 if(alpha) { | |
380 if(alpha == SDL_ALPHA_OPAQUE) { | |
381 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ | |
382 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ | |
383 pand_r2r(mm0, mm1); /* src RGB -> mm1 */ | |
384 pand_r2r(mm6, mm2); /* dst alpha -> mm2 */ | |
385 por_r2r(mm1, mm2); /* src RGB | dst alpha -> mm2 */ | |
386 movd_r2m(mm2, (*dstp)); | |
387 } else { | |
388 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ | |
389 punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */ | |
390 pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */ | |
391 | |
392 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ | |
393 punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */ | |
394 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ | |
395 | |
396 movq_r2r(mm2, mm5);/* mm2(0A0R0G0B) -> mm5 */ | |
397 pand_r2r(mm7, mm5); /* mm5 & dst alpha mask -> mm5(0A000000) */ | |
398 psrlq_i2r(24, mm5); /* mm5 >> 24 -> mm5 (0000A000)*/ | |
399 | |
400 movq_r2r(mm1, mm4);/* mm1(0A0R0G0B) -> mm4 */ | |
401 psrlq_i2r(48, mm4); /* mm4 >> 48 -> mm4(0000000A) */ | |
402 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */ | |
403 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */ | |
404 | |
405 /* blend */ | |
406 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ | |
407 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ | |
408 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ | |
409 paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ | |
410 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ | |
411 packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */ | |
412 pand_r2r(mm0, mm2); /* 0RGB0RGB -> mm2 */ | |
413 por_r2r(mm5, mm2); /* dst alpha | mm2 -> mm2 */ | |
414 movd_r2m(mm2, *dstp);/* mm2 -> dst */ | |
415 } | |
416 } | |
417 ++srcp; | |
418 ++dstp; | |
419 }, width); | |
420 srcp += srcskip; | |
421 dstp += dstskip; | |
422 } | |
423 emms(); | |
424 } | |
425 #endif | |
426 | |
198 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ | 427 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ |
199 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) | 428 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info) |
200 { | 429 { |
201 int width = info->d_width; | 430 int width = info->d_width; |
202 int height = info->d_height; | 431 int height = info->d_height; |
228 int height = info->d_height; | 457 int height = info->d_height; |
229 Uint32 *srcp = (Uint32 *)info->s_pixels; | 458 Uint32 *srcp = (Uint32 *)info->s_pixels; |
230 int srcskip = info->s_skip >> 2; | 459 int srcskip = info->s_skip >> 2; |
231 Uint32 *dstp = (Uint32 *)info->d_pixels; | 460 Uint32 *dstp = (Uint32 *)info->d_pixels; |
232 int dstskip = info->d_skip >> 2; | 461 int dstskip = info->d_skip >> 2; |
462 Uint32 s; | |
463 Uint32 d; | |
464 Uint32 s1; | |
465 Uint32 d1; | |
233 | 466 |
234 while(height--) { | 467 while(height--) { |
235 DUFFS_LOOP4({ | 468 DUFFS_LOOP_DOUBLE2({ |
236 Uint32 s; | 469 /* One Pixel Blend */ |
237 Uint32 d; | |
238 Uint32 s1; | |
239 Uint32 d1; | |
240 s = *srcp; | 470 s = *srcp; |
241 d = *dstp; | 471 d = *dstp; |
242 s1 = s & 0xff00ff; | 472 s1 = s & 0xff00ff; |
243 d1 = d & 0xff00ff; | 473 d1 = d & 0xff00ff; |
244 d1 = (d1 + ((s1 - d1) * alpha >> 8)) | 474 d1 = (d1 + ((s1 - d1) * alpha >> 8)) |
247 d &= 0xff00; | 477 d &= 0xff00; |
248 d = (d + ((s - d) * alpha >> 8)) & 0xff00; | 478 d = (d + ((s - d) * alpha >> 8)) & 0xff00; |
249 *dstp = d1 | d | 0xff000000; | 479 *dstp = d1 | d | 0xff000000; |
250 ++srcp; | 480 ++srcp; |
251 ++dstp; | 481 ++dstp; |
482 },{ | |
483 /* Two Pixels Blend */ | |
484 s = *srcp; | |
485 d = *dstp; | |
486 s1 = s & 0xff00ff; | |
487 d1 = d & 0xff00ff; | |
488 d1 += (s1 - d1) * alpha >> 8; | |
489 d1 &= 0xff00ff; | |
490 | |
491 s = ((s & 0xff00) >> 8) | | |
492 ((srcp[1] & 0xff00) << 8); | |
493 d = ((d & 0xff00) >> 8) | | |
494 ((dstp[1] & 0xff00) << 8); | |
495 d += (s - d) * alpha >> 8; | |
496 d &= 0x00ff00ff; | |
497 | |
498 *dstp++ = d1 | ((d << 8) & 0xff00) | 0xff000000; | |
499 ++srcp; | |
500 | |
501 s1 = *srcp; | |
502 d1 = *dstp; | |
503 s1 &= 0xff00ff; | |
504 d1 &= 0xff00ff; | |
505 d1 += (s1 - d1) * alpha >> 8; | |
506 d1 &= 0xff00ff; | |
507 | |
508 *dstp = d1 | ((d >> 8) & 0xff00) | 0xff000000; | |
509 ++srcp; | |
510 ++dstp; | |
252 }, width); | 511 }, width); |
253 srcp += srcskip; | 512 srcp += srcskip; |
254 dstp += dstskip; | 513 dstp += dstskip; |
255 } | 514 } |
256 } | 515 } |
276 Uint32 alpha = s >> 24; | 535 Uint32 alpha = s >> 24; |
277 /* FIXME: Here we special-case opaque alpha since the | 536 /* FIXME: Here we special-case opaque alpha since the |
278 compositioning used (>>8 instead of /255) doesn't handle | 537 compositioning used (>>8 instead of /255) doesn't handle |
279 it correctly. Also special-case alpha=0 for speed? | 538 it correctly. Also special-case alpha=0 for speed? |
280 Benchmark this! */ | 539 Benchmark this! */ |
281 if(alpha == SDL_ALPHA_OPAQUE) { | 540 if(alpha) { |
541 if(alpha == SDL_ALPHA_OPAQUE) { | |
282 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); | 542 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); |
283 } else { | 543 } else { |
284 /* | 544 /* |
285 * take out the middle component (green), and process | 545 * take out the middle component (green), and process |
286 * the other two in parallel. One multiply less. | 546 * the other two in parallel. One multiply less. |
287 */ | 547 */ |
288 d = *dstp; | 548 d = *dstp; |
292 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; | 552 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; |
293 s &= 0xff00; | 553 s &= 0xff00; |
294 d &= 0xff00; | 554 d &= 0xff00; |
295 d = (d + ((s - d) * alpha >> 8)) & 0xff00; | 555 d = (d + ((s - d) * alpha >> 8)) & 0xff00; |
296 *dstp = d1 | d | dalpha; | 556 *dstp = d1 | d | dalpha; |
557 } | |
297 } | 558 } |
298 ++srcp; | 559 ++srcp; |
299 ++dstp; | 560 ++dstp; |
300 }, width); | 561 }, width); |
301 srcp += srcskip; | 562 srcp += srcskip; |
302 dstp += dstskip; | 563 dstp += dstskip; |
303 } | 564 } |
304 } | 565 } |
566 | |
567 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT) | |
568 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha; fully opaque source pixels are copied in C, all other pixels go through the MMX blend, and the destination alpha byte is kept */ | |
569 inline static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info) | |
570 { | |
571 int width = info->d_width; | |
572 int height = info->d_height; | |
573 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
574 int srcskip = info->s_skip >> 2; | |
575 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
576 int dstskip = info->d_skip >> 2; | |
577 | |
578 Uint32 s; | |
579 Uint32 alpha; | |
580 | |
581 __asm__ ( | |
582 /* make mm6 all zeros. */ | |
583 "pxor %%mm6, %%mm6\n" | |
584 | |
585 /* Make a mask to preserve the alpha. */ | |
586 "pcmpeqb %%mm7, %%mm7\n\t" /* mm7(s) = FF FF FF FF | FF FF FF FF */ | |
587 "psrlq $16, %%mm7\n\t" /* mm7(s) = 00 00 FF FF | FF FF FF FF */ | |
588 | |
589 : ); | |
590 | |
591 while(height--) { | |
592 | |
593 DUFFS_LOOP4({ | |
594 | |
595 __asm__ ( | |
596 "prefetch 64(%0)\n" | |
597 "prefetch 64(%1)\n" | |
598 : : "r" (srcp), "r" (dstp) ); | |
599 | |
600 s = *srcp; | |
601 alpha = s >> 24; | |
602 /* FIXME: Here we special-case opaque alpha since the | |
603 compositioning used (>>8 instead of /255) doesn't handle | |
604 it correctly. Also special-case alpha=0 for speed? | |
605 Benchmark this! */ | |
606 | |
607 if(alpha == SDL_ALPHA_OPAQUE) { | |
608 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); | |
609 } | |
610 | |
611 else { | |
612 __asm__ ( | |
613 /* load in the source, and dst. */ | |
614 "movd (%0), %%mm0\n" /* mm0(s) = 0 0 0 0 | As Rs Gs Bs */ | |
615 "movd (%1), %%mm1\n" /* mm1(d) = 0 0 0 0 | Ad Rd Gd Bd */ | |
616 | |
617 /* Move the src alpha into mm2 */ | |
618 | |
619 /* if supporting pshufw */ | |
620 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */ | |
621 /*"psrlw $8, %%mm2\n" */ | |
622 | |
623 /* else: */ | |
624 "movq %%mm0, %%mm2\n" | |
625 "psrld $24, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */ | |
626 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */ | |
627 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */ | |
628 | |
629 /* move the colors into words. */ | |
630 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */ | |
631 "punpcklbw %%mm6, %%mm1\n" /* mm1 = 0 Ad 0 Rd | 0 Gd 0 Bd */ | |
632 | |
633 /* src - dst */ | |
634 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */ | |
635 | |
636 /* A * (src-dst) */ | |
637 "pmullw %%mm2, %%mm0\n" /* mm0 = As*As-d As*Rs-d | As*Gs-d As*Bs-d */ | |
638 "pand %%mm7, %%mm0\n" /* to preserve dest alpha: the alpha word of the product is cleared */ | |
639 "psrlw $8, %%mm0\n" /* mm0 = Ac>>8 Rc>>8 | Gc>>8 Bc>>8 */ | |
640 "paddb %%mm1, %%mm0\n" /* mm0 = Ac+Ad Rc+Rd | Gc+Gd Bc+Bd */ | |
641 | |
642 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */ | |
643 | |
644 "movd %%mm0, (%1)\n" /* result in mm0 */ | |
645 | |
646 : : "r" (srcp), "r" (dstp) ); /* NOTE(review): no outputs or clobbers are declared although the asm writes mm0-mm2 and stores through %1 */ | |
647 | |
648 } | |
649 ++srcp; | |
650 ++dstp; | |
651 }, width); | |
652 srcp += srcskip; | |
653 dstp += dstskip; | |
654 } | |
655 | |
656 __asm__ ( | |
657 "emms\n" | |
658 : ); | |
659 } | |
660 #endif | |
305 | 661 |
306 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ | 662 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ |
307 | 663 |
308 /* blend a single 16 bit pixel at 50% */ | 664 /* blend a single 16 bit pixel at 50% */ |
309 #define BLEND16_50(d, s, mask) \ | 665 #define BLEND16_50(d, s, mask) \ |
407 srcp += srcskip; | 763 srcp += srcskip; |
408 dstp += dstskip; | 764 dstp += dstskip; |
409 } | 765 } |
410 } | 766 } |
411 } | 767 } |
768 | |
769 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT) | |
770 /* fast RGB565->RGB565 blending with surface alpha; alpha == 128 is handed to Blit16to16SurfaceAlpha128, otherwise one- and two-pixel bodies blend in C and the four-pixel body blends with MMX */ | |
771 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) | |
772 { | |
773 unsigned alpha = info->src->alpha; /* per-surface alpha (downscaled to 5 bits further below) */ | |
774 if(alpha == 128) { | |
775 Blit16to16SurfaceAlpha128(info, 0xf7de); | |
776 } else { | |
777 int width = info->d_width; | |
778 int height = info->d_height; | |
779 Uint16 *srcp = (Uint16 *)info->s_pixels; | |
780 int srcskip = info->s_skip >> 1; | |
781 Uint16 *dstp = (Uint16 *)info->d_pixels; | |
782 int dstskip = info->d_skip >> 1; | |
783 Uint32 s, d; | |
784 Uint8 load[8]; /* 8-byte scratch buffer used to stage 64-bit constants for movq */ | |
785 | |
786 alpha &= ~(1+2+4); /* clear the 3 low alpha bits so the MMX path behaves exactly like the 5-bit C path */ | |
787 *(Uint64 *)load = alpha; /* masked 8-bit alpha, staged for the MMX path */ | |
788 alpha >>= 3; /* downscale alpha to 5 bits (used by the C bodies below) */ | |
789 | |
790 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ | |
791 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ | |
792 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ | |
793 | |
794 /* Setup the 565 color channel masks */ | |
795 *(Uint64 *)load = 0xF800F800F800F800; | |
796 movq_m2r(*load, mm1); /* MASKRED -> mm1 */ | |
797 *(Uint64 *)load = 0x07E007E007E007E0; | |
798 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */ | |
799 *(Uint64 *)load = 0x001F001F001F001F; | |
800 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */ | |
801 while(height--) { | |
802 DUFFS_LOOP_QUATRO2( | |
803 { /* blends one pixel in C */ | |
804 s = *srcp++; | |
805 d = *dstp; | |
806 /* | |
807 * shift out the middle component (green) to | |
808 * the high 16 bits, and process all three RGB | |
809 * components at the same time. | |
810 */ | |
811 s = (s | s << 16) & 0x07e0f81f; | |
812 d = (d | d << 16) & 0x07e0f81f; | |
813 d += (s - d) * alpha >> 5; | |
814 d &= 0x07e0f81f; | |
815 *dstp++ = d | d >> 16; | |
816 },{ /* blends two pixels in C, one after the other */ | |
817 s = *srcp++; | |
818 d = *dstp; | |
819 /* | |
820 * shift out the middle component (green) to | |
821 * the high 16 bits, and process all three RGB | |
822 * components at the same time. | |
823 */ | |
824 s = (s | s << 16) & 0x07e0f81f; | |
825 d = (d | d << 16) & 0x07e0f81f; | |
826 d += (s - d) * alpha >> 5; | |
827 d &= 0x07e0f81f; | |
828 *dstp++ = d | d >> 16; | |
829 s = *srcp++; | |
830 d = *dstp; | |
831 /* | |
832 * shift out the middle component (green) to | |
833 * the high 16 bits, and process all three RGB | |
834 * components at the same time. | |
835 */ | |
836 s = (s | s << 16) & 0x07e0f81f; | |
837 d = (d | d << 16) & 0x07e0f81f; | |
838 d += (s - d) * alpha >> 5; | |
839 d &= 0x07e0f81f; | |
840 *dstp++ = d | d >> 16; | |
841 },{ /* blends four pixels at once with MMX, one colour channel at a time */ | |
842 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ | |
843 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ | |
844 | |
845 /* RED */ | |
846 movq_r2r(mm2, mm5); /* src -> mm5 */ | |
847 pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */ | |
848 psrlq_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */ | |
849 | |
850 movq_r2r(mm3, mm6); /* dst -> mm6 */ | |
851 pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */ | |
852 psrlq_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */ | |
853 | |
854 /* blend */ | |
855 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | |
856 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | |
857 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ | |
858 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | |
859 psllq_i2r(11, mm6); /* mm6 << 11 -> mm6 */ | |
860 pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */ | |
861 | |
862 movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */ | |
863 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ | |
864 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */ | |
865 por_r2r(mm6, mm3); /* save new reds in dsts */ | |
866 | |
867 /* green */ | |
868 movq_r2r(mm2, mm5); /* src -> mm5 */ | |
869 pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */ | |
870 psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */ | |
871 | |
872 movq_r2r(mm3, mm6); /* dst -> mm6 */ | |
873 pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */ | |
874 psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */ | |
875 | |
876 /* blend */ | |
877 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | |
878 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | |
879 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ | |
880 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | |
881 psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */ | |
882 pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */ | |
883 | |
884 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ | |
885 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ | |
886 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */ | |
887 por_r2r(mm6, mm3); /* save new greens in dsts */ | |
888 | |
889 /* blue */ | |
890 movq_r2r(mm2, mm5); /* src -> mm5 */ | |
891 pand_r2r(mm7 , mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */ | |
892 | |
893 movq_r2r(mm3, mm6); /* dst -> mm6 */ | |
894 pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ | |
895 | |
896 /* blend */ | |
897 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | |
898 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | |
899 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ | |
900 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | |
901 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */ | |
902 | |
903 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ | |
904 por_r2r(mm4, mm5); /* MASKGREEN | mm5 -> mm5 */ | |
905 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */ | |
906 por_r2r(mm6, mm3); /* save new blues in dsts */ | |
907 | |
908 movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */ | |
909 | |
910 srcp += 4; | |
911 dstp += 4; | |
912 }, width); | |
913 srcp += srcskip; | |
914 dstp += dstskip; | |
915 } | |
916 emms(); | |
917 } | |
918 } | |
919 | |
920 /* fast RGB555->RGB555 blending with surface alpha; alpha == 128 is handed to Blit16to16SurfaceAlpha128, otherwise one- and two-pixel bodies blend in C and the four-pixel body blends with MMX */ | |
921 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info) | |
922 { | |
923 unsigned alpha = info->src->alpha; /* per-surface alpha (downscaled to 5 bits further below) */ | |
924 if(alpha == 128) { | |
925 Blit16to16SurfaceAlpha128(info, 0xfbde); | |
926 } else { | |
927 int width = info->d_width; | |
928 int height = info->d_height; | |
929 Uint16 *srcp = (Uint16 *)info->s_pixels; | |
930 int srcskip = info->s_skip >> 1; | |
931 Uint16 *dstp = (Uint16 *)info->d_pixels; | |
932 int dstskip = info->d_skip >> 1; | |
933 Uint32 s, d; | |
934 Uint8 load[8]; /* 8-byte scratch buffer used to stage 64-bit constants for movq */ | |
935 | |
936 alpha &= ~(1+2+4); /* clear the 3 low alpha bits so the MMX path behaves exactly like the 5-bit C path */ | |
937 *(Uint64 *)load = alpha; /* masked 8-bit alpha, staged for the MMX path */ | |
938 alpha >>= 3; /* downscale alpha to 5 bits (used by the C bodies below) */ | |
939 | |
940 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ | |
941 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ | |
942 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ | |
943 | |
944 /* Setup the 555 color channel masks */ | |
945 *(Uint64 *)load = 0x7C007C007C007C00; | |
946 movq_m2r(*load, mm1); /* MASKRED -> mm1 */ | |
947 *(Uint64 *)load = 0x03E003E003E003E0; | |
948 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */ | |
949 *(Uint64 *)load = 0x001F001F001F001F; | |
950 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */ | |
951 while(height--) { | |
952 DUFFS_LOOP_QUATRO2( | |
953 { /* blends one pixel in C */ | |
954 s = *srcp++; | |
955 d = *dstp; | |
956 /* | |
957 * shift out the middle component (green) to | |
958 * the high 16 bits, and process all three RGB | |
959 * components at the same time. | |
960 */ | |
961 s = (s | s << 16) & 0x03e07c1f; | |
962 d = (d | d << 16) & 0x03e07c1f; | |
963 d += (s - d) * alpha >> 5; | |
964 d &= 0x03e07c1f; | |
965 *dstp++ = d | d >> 16; | |
966 },{ /* blends two pixels in C, one after the other */ | |
967 s = *srcp++; | |
968 d = *dstp; | |
969 /* | |
970 * shift out the middle component (green) to | |
971 * the high 16 bits, and process all three RGB | |
972 * components at the same time. | |
973 */ | |
974 s = (s | s << 16) & 0x03e07c1f; | |
975 d = (d | d << 16) & 0x03e07c1f; | |
976 d += (s - d) * alpha >> 5; | |
977 d &= 0x03e07c1f; | |
978 *dstp++ = d | d >> 16; | |
979 s = *srcp++; | |
980 d = *dstp; | |
981 /* | |
982 * shift out the middle component (green) to | |
983 * the high 16 bits, and process all three RGB | |
984 * components at the same time. | |
985 */ | |
986 s = (s | s << 16) & 0x03e07c1f; | |
987 d = (d | d << 16) & 0x03e07c1f; | |
988 d += (s - d) * alpha >> 5; | |
989 d &= 0x03e07c1f; | |
990 *dstp++ = d | d >> 16; | |
991 },{ /* blends four pixels at once with MMX, one colour channel at a time */ | |
992 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ | |
993 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ | |
994 | |
995 /* RED */ | |
996 movq_r2r(mm2, mm5); /* src -> mm5 */ | |
997 pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */ | |
998 psrlq_i2r(10, mm5); /* mm5 >> 10 -> mm5 [000r 000r 000r 000r] */ | |
999 | |
1000 movq_r2r(mm3, mm6); /* dst -> mm6 */ | |
1001 pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */ | |
1002 psrlq_i2r(10, mm6); /* mm6 >> 10 -> mm6 [000r 000r 000r 000r] */ | |
1003 | |
1004 /* blend */ | |
1005 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | |
1006 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | |
1007 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ | |
1008 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | |
1009 psllq_i2r(10, mm6); /* mm6 << 10 -> mm6 */ | |
1010 pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */ | |
1011 | |
1012 movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */ | |
1013 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ | |
1014 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */ | |
1015 por_r2r(mm6, mm3); /* save new reds in dsts */ | |
1016 | |
1017 /* green */ | |
1018 movq_r2r(mm2, mm5); /* src -> mm5 */ | |
1019 pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */ | |
1020 psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */ | |
1021 | |
1022 movq_r2r(mm3, mm6); /* dst -> mm6 */ | |
1023 pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */ | |
1024 psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */ | |
1025 | |
1026 /* blend */ | |
1027 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | |
1028 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | |
1029 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ | |
1030 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | |
1031 psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */ | |
1032 pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */ | |
1033 | |
1034 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ | |
1035 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ | |
1036 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */ | |
1037 por_r2r(mm6, mm3); /* save new greens in dsts */ | |
1038 | |
1039 /* blue */ | |
1040 movq_r2r(mm2, mm5); /* src -> mm5 */ | |
1041 pand_r2r(mm7 , mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */ | |
1042 | |
1043 movq_r2r(mm3, mm6); /* dst -> mm6 */ | |
1044 pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ | |
1045 | |
1046 /* blend */ | |
1047 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | |
1048 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | |
1049 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ | |
1050 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | |
1051 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */ | |
1052 | |
1053 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ | |
1054 por_r2r(mm4, mm5); /* MASKGREEN | mm5 -> mm5 */ | |
1055 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */ | |
1056 por_r2r(mm6, mm3); /* save new blues in dsts */ | |
1057 | |
1058 movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */ | |
1059 | |
1060 srcp += 4; | |
1061 dstp += 4; | |
1062 }, width); | |
1063 srcp += srcskip; | |
1064 dstp += dstskip; | |
1065 } | |
1066 emms(); | |
1067 } | |
1068 } | |
1069 #endif | |
412 | 1070 |
413 /* fast RGB565->RGB565 blending with surface alpha */ | 1071 /* fast RGB565->RGB565 blending with surface alpha */ |
414 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info) | 1072 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info) |
415 { | 1073 { |
416 unsigned alpha = info->src->alpha; | 1074 unsigned alpha = info->src->alpha; |
498 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */ | 1156 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */ |
499 /* FIXME: Here we special-case opaque alpha since the | 1157 /* FIXME: Here we special-case opaque alpha since the |
500 compositioning used (>>8 instead of /255) doesn't handle | 1158 compositioning used (>>8 instead of /255) doesn't handle |
501 it correctly. Also special-case alpha=0 for speed? | 1159 it correctly. Also special-case alpha=0 for speed? |
502 Benchmark this! */ | 1160 Benchmark this! */ |
503 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { | 1161 if(alpha) { |
1162 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { | |
504 *dstp = (s >> 8 & 0xf800) + (s >> 5 & 0x7e0) | 1163 *dstp = (s >> 8 & 0xf800) + (s >> 5 & 0x7e0) |
505 + (s >> 3 & 0x1f); | 1164 + (s >> 3 & 0x1f); |
506 } else { | 1165 } else { |
507 Uint32 d = *dstp; | 1166 Uint32 d = *dstp; |
508 /* | 1167 /* |
509 * convert source and destination to G0RAB65565 | 1168 * convert source and destination to G0RAB65565 |
510 * and blend all components at the same time | 1169 * and blend all components at the same time |
511 */ | 1170 */ |
513 + (s >> 3 & 0x1f); | 1172 + (s >> 3 & 0x1f); |
514 d = (d | d << 16) & 0x07e0f81f; | 1173 d = (d | d << 16) & 0x07e0f81f; |
515 d += (s - d) * alpha >> 5; | 1174 d += (s - d) * alpha >> 5; |
516 d &= 0x07e0f81f; | 1175 d &= 0x07e0f81f; |
517 *dstp = d | d >> 16; | 1176 *dstp = d | d >> 16; |
1177 } | |
518 } | 1178 } |
519 srcp++; | 1179 srcp++; |
520 dstp++; | 1180 dstp++; |
521 }, width); | 1181 }, width); |
522 srcp += srcskip; | 1182 srcp += srcskip; |
541 alpha = s >> 27; /* downscale alpha to 5 bits */ | 1201 alpha = s >> 27; /* downscale alpha to 5 bits */ |
542 /* FIXME: Here we special-case opaque alpha since the | 1202 /* FIXME: Here we special-case opaque alpha since the |
543 compositing used (>>8 instead of /255) doesn't handle | 1203 compositing used (>>8 instead of /255) doesn't handle |
544 it correctly. Also special-case alpha=0 for speed? | 1204 it correctly. Also special-case alpha=0 for speed? |
545 Benchmark this! */ | 1205 Benchmark this! */ |
546 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { | 1206 if(alpha) { |
1207 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) { | |
547 *dstp = (s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) | 1208 *dstp = (s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) |
548 + (s >> 3 & 0x1f); | 1209 + (s >> 3 & 0x1f); |
549 } else { | 1210 } else { |
550 Uint32 d = *dstp; | 1211 Uint32 d = *dstp; |
551 /* | 1212 /* |
552 * convert source and destination to G0RAB65565 | 1213 * convert source and destination to G0RAB65565 |
553 * and blend all components at the same time | 1214 * and blend all components at the same time |
554 */ | 1215 */ |
556 + (s >> 3 & 0x1f); | 1217 + (s >> 3 & 0x1f); |
557 d = (d | d << 16) & 0x03e07c1f; | 1218 d = (d | d << 16) & 0x03e07c1f; |
558 d += (s - d) * alpha >> 5; | 1219 d += (s - d) * alpha >> 5; |
559 d &= 0x03e07c1f; | 1220 d &= 0x03e07c1f; |
560 *dstp = d | d >> 16; | 1221 *dstp = d | d >> 16; |
1222 } | |
561 } | 1223 } |
562 srcp++; | 1224 srcp++; |
563 dstp++; | 1225 dstp++; |
564 }, width); | 1226 }, width); |
565 srcp += srcskip; | 1227 srcp += srcskip; |
581 int srcbpp = srcfmt->BytesPerPixel; | 1243 int srcbpp = srcfmt->BytesPerPixel; |
582 int dstbpp = dstfmt->BytesPerPixel; | 1244 int dstbpp = dstfmt->BytesPerPixel; |
583 unsigned sA = srcfmt->alpha; | 1245 unsigned sA = srcfmt->alpha; |
584 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0; | 1246 unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0; |
585 | 1247 |
586 while ( height-- ) { | 1248 if(sA) { |
1249 while ( height-- ) { | |
587 DUFFS_LOOP4( | 1250 DUFFS_LOOP4( |
588 { | 1251 { |
589 Uint32 pixel; | 1252 Uint32 pixel; |
590 unsigned sR; | 1253 unsigned sR; |
591 unsigned sG; | 1254 unsigned sG; |
601 dst += dstbpp; | 1264 dst += dstbpp; |
602 }, | 1265 }, |
603 width); | 1266 width); |
604 src += srcskip; | 1267 src += srcskip; |
605 dst += dstskip; | 1268 dst += dstskip; |
1269 } | |
606 } | 1270 } |
607 } | 1271 } |
608 | 1272 |
609 /* General (slow) colorkeyed N->N blending with per-surface alpha */ | 1273 /* General (slow) colorkeyed N->N blending with per-surface alpha */ |
610 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info) | 1274 static void BlitNtoNSurfaceAlphaKey(SDL_BlitInfo *info) |
632 unsigned sB; | 1296 unsigned sB; |
633 unsigned dR; | 1297 unsigned dR; |
634 unsigned dG; | 1298 unsigned dG; |
635 unsigned dB; | 1299 unsigned dB; |
636 RETRIEVE_RGB_PIXEL(src, srcbpp, pixel); | 1300 RETRIEVE_RGB_PIXEL(src, srcbpp, pixel); |
637 if(pixel != ckey) { | 1301 if(sA && pixel != ckey) { |
638 RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); | 1302 RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); |
639 DISEMBLE_RGB(dst, dstbpp, dstfmt, pixel, dR, dG, dB); | 1303 DISEMBLE_RGB(dst, dstbpp, dstfmt, pixel, dR, dG, dB); |
640 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); | 1304 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); |
641 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); | 1305 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); |
642 } | 1306 } |
684 unsigned dG; | 1348 unsigned dG; |
685 unsigned dB; | 1349 unsigned dB; |
686 unsigned sA; | 1350 unsigned sA; |
687 unsigned dA; | 1351 unsigned dA; |
688 DISEMBLE_RGBA(src, srcbpp, srcfmt, pixel, sR, sG, sB, sA); | 1352 DISEMBLE_RGBA(src, srcbpp, srcfmt, pixel, sR, sG, sB, sA); |
689 DISEMBLE_RGBA(dst, dstbpp, dstfmt, pixel, dR, dG, dB, dA); | 1353 if(sA) { |
690 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); | 1354 DISEMBLE_RGBA(dst, dstbpp, dstfmt, pixel, dR, dG, dB, dA); |
691 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); | 1355 ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); |
1356 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA); | |
1357 } | |
692 src += srcbpp; | 1358 src += srcbpp; |
693 dst += dstbpp; | 1359 dst += dstbpp; |
694 }, | 1360 }, |
695 width); | 1361 width); |
696 src += srcskip; | 1362 src += srcskip; |
717 return BlitNto1SurfaceAlpha; | 1383 return BlitNto1SurfaceAlpha; |
718 | 1384 |
719 case 2: | 1385 case 2: |
720 if(surface->map->identity) { | 1386 if(surface->map->identity) { |
721 if(df->Gmask == 0x7e0) | 1387 if(df->Gmask == 0x7e0) |
1388 { | |
1389 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT) | |
1390 if((CPU_Flags()&MMX_CPU)!=0) | |
1391 return Blit565to565SurfaceAlphaMMX; | |
1392 else | |
1393 #endif | |
722 return Blit565to565SurfaceAlpha; | 1394 return Blit565to565SurfaceAlpha; |
1395 } | |
723 else if(df->Gmask == 0x3e0) | 1396 else if(df->Gmask == 0x3e0) |
1397 { | |
1398 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT) | |
1399 if((CPU_Flags()&MMX_CPU)!=0) | |
1400 return Blit555to555SurfaceAlphaMMX; | |
1401 else | |
1402 #endif | |
724 return Blit555to555SurfaceAlpha; | 1403 return Blit555to555SurfaceAlpha; |
1404 } | |
725 } | 1405 } |
726 return BlitNtoNSurfaceAlpha; | 1406 return BlitNtoNSurfaceAlpha; |
727 | 1407 |
728 case 4: | 1408 case 4: |
729 if(sf->Rmask == df->Rmask | 1409 if(sf->Rmask == df->Rmask |
730 && sf->Gmask == df->Gmask | 1410 && sf->Gmask == df->Gmask |
731 && sf->Bmask == df->Bmask | 1411 && sf->Bmask == df->Bmask |
732 && (sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff | 1412 && (sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff |
733 && sf->BytesPerPixel == 4) | 1413 && sf->BytesPerPixel == 4) |
1414 { | |
1415 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT) | |
1416 if((CPU_Flags()&MMX_CPU)!=0) | |
1417 return BlitRGBtoRGBSurfaceAlphaMMX; | |
1418 else | |
1419 #endif | |
734 return BlitRGBtoRGBSurfaceAlpha; | 1420 return BlitRGBtoRGBSurfaceAlpha; |
1421 } | |
735 else | 1422 else |
736 return BlitNtoNSurfaceAlpha; | 1423 return BlitNtoNSurfaceAlpha; |
737 | 1424 |
738 case 3: | 1425 case 3: |
739 default: | 1426 default: |
762 if(sf->Amask == 0xff000000 | 1449 if(sf->Amask == 0xff000000 |
763 && sf->Rmask == df->Rmask | 1450 && sf->Rmask == df->Rmask |
764 && sf->Gmask == df->Gmask | 1451 && sf->Gmask == df->Gmask |
765 && sf->Bmask == df->Bmask | 1452 && sf->Bmask == df->Bmask |
766 && sf->BytesPerPixel == 4) | 1453 && sf->BytesPerPixel == 4) |
767 return BlitRGBtoRGBPixelAlpha; | 1454 { |
1455 #if defined(i386) && defined(__GNUC__) && defined(USE_ASMBLIT) | |
1456 Uint32 f; | |
1457 f=CPU_Flags(); | |
1458 if((f&(TDNOW_CPU|MMX_CPU))==(TDNOW_CPU|MMX_CPU)) | |
1459 return BlitRGBtoRGBPixelAlphaMMX3DNOW; | |
1460 else | |
1461 if((f&MMX_CPU)!=0) | |
1462 return BlitRGBtoRGBPixelAlphaMMX; | |
1463 else | |
1464 #endif | |
1465 return BlitRGBtoRGBPixelAlpha; | |
1466 } | |
768 return BlitNtoNPixelAlpha; | 1467 return BlitNtoNPixelAlpha; |
769 | 1468 |
770 case 3: | 1469 case 3: |
771 default: | 1470 default: |
772 return BlitNtoNPixelAlpha; | 1471 return BlitNtoNPixelAlpha; |