comparison src/video/SDL_blit_A.c @ 1542:a8bf1aa21020

Fixed bug #15 SDL_blit_A.mmx-speed.patch.txt -- Speed improvements and a bugfix for the current GCC inline MMX asm code: - Changed some ops and removed some resulting useless ones. - Added some instruction parallelism (some gain). The resulting speed on my Xeon improved up to 35% depending on the function (measured in fps). - Fixed a bug where BlitRGBtoRGBSurfaceAlphaMMX() was setting the alpha component on the destination surfaces (to opaque-alpha) even when the surface had none. SDL_blit_A.mmx-msvc.patch.txt -- MSVC MMX intrinsics version of the same GCC asm code. The MSVC compiler tries to parallelize the code and to avoid register stalls, but does not always do a very good job. Per-surface blending MSVC functions run quite a bit faster than their pure-asm counterparts (up to 55% faster for 16-bit ones), but the per-pixel blending runs somewhat slower than asm. - BlitRGBtoRGBSurfaceAlphaMMX and BlitRGBtoRGBPixelAlphaMMX (and all variants) can now also handle formats other than (A)RGB8888. Formats like RGBA8888 and some quite exotic ones are allowed -- like RAGB8888, or actually anything having channels aligned on an 8-bit boundary and full 8-bit alpha (for per-pixel alpha blending). The performance cost of this change is virtually 0 for per-surface alpha blending (no extra ops inside the loop) and a single non-MMX op inside the loop for per-pixel blending. In testing, the per-pixel alpha blending takes a ~2% performance hit, but it still runs much faster than the current code in CVS. If necessary, a separate function with this functionality can be made. This code requires the Processor Pack for VC6.
author Sam Lantinga <slouken@libsdl.org>
date Wed, 15 Mar 2006 15:39:29 +0000
parents dc6b59e925a2
children 4b835e36633d
comparison
equal deleted inserted replaced
1541:157001382dfd 1542:a8bf1aa21020
22 #include "SDL_config.h" 22 #include "SDL_config.h"
23 23
24 #include "SDL_video.h" 24 #include "SDL_video.h"
25 #include "SDL_blit.h" 25 #include "SDL_blit.h"
26 26
27 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES 27 #if SDL_ASSEMBLY_ROUTINES
28 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
28 #define MMX_ASMBLIT 1 29 #define MMX_ASMBLIT 1
30 #define GCC_ASMBLIT 1
31 #elif defined(_MSC_VER) && (_MSC_VER >= 1200) && defined(_M_IX86)
32 #define MMX_ASMBLIT 1
33 #define MSVC_ASMBLIT 1
29 #endif 34 #endif
35 #endif /* SDL_ASSEMBLY_ROUTINES */
30 36
31 /* Function to check the CPU flags */ 37 /* Function to check the CPU flags */
32 #include "SDL_cpuinfo.h" 38 #include "SDL_cpuinfo.h"
33 #if MMX_ASMBLIT 39 #if GCC_ASMBLIT
34 #include "mmx.h" 40 #include "mmx.h"
41 #elif MSVC_ASMBLIT
42 #include <mmintrin.h>
43 #include <mm3dnow.h>
35 #endif 44 #endif
36 45
37 /* Functions to perform alpha blended blitting */ 46 /* Functions to perform alpha blended blitting */
38 47
39 /* N->1 blending with per-surface alpha */ 48 /* N->1 blending with per-surface alpha */
196 src += srcskip; 205 src += srcskip;
197 dst += dstskip; 206 dst += dstskip;
198 } 207 }
199 } 208 }
200 209
201 #if MMX_ASMBLIT 210 #if GCC_ASMBLIT
202 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ 211 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
203 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) 212 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
204 { 213 {
205 int width = info->d_width; 214 int width = info->d_width;
206 int height = info->d_height; 215 int height = info->d_height;
207 Uint32 *srcp = (Uint32 *)info->s_pixels; 216 Uint32 *srcp = (Uint32 *)info->s_pixels;
208 int srcskip = info->s_skip >> 2; 217 int srcskip = info->s_skip >> 2;
209 Uint32 *dstp = (Uint32 *)info->d_pixels; 218 Uint32 *dstp = (Uint32 *)info->d_pixels;
210 int dstskip = info->d_skip >> 2; 219 int dstskip = info->d_skip >> 2;
211 Uint8 load[8]; 220 Uint32 dalpha = info->dst->Amask;
212 221 Uint8 load[8];
213 *(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */ 222
214 movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */ 223 *(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */
215 *(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */ 224 movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */
216 movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */ 225 *(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */
217 *(Uint64 *)load = 0xFF000000FF000000ULL;/* dst alpha mask */ 226 movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */
218 movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */ 227 movd_m2r(dalpha, mm7); /* dst alpha mask */
228 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
219 while(height--) { 229 while(height--) {
220 DUFFS_LOOP_DOUBLE2( 230 DUFFS_LOOP_DOUBLE2(
221 { 231 {
222 Uint32 s = *srcp++; 232 Uint32 s = *srcp++;
223 Uint32 d = *dstp; 233 Uint32 d = *dstp;
224 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) 234 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
225 + (s & d & 0x00010101)) | 0xff000000; 235 + (s & d & 0x00010101)) | dalpha;
226 },{ 236 },{
227 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ 237 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
228 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ 238 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
229 239
230 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */ 240 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */
231 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */ 241 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */
232 242
233 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */ 243 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */
234 pand_r2r(mm4, mm5); /* src & mask -> mm5 */ 244 pand_r2r(mm4, mm5); /* src & mask -> mm5 */
235 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */ 245 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */
236 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */ 246 pand_r2r(mm1, mm2); /* src & dst -> mm2 */
237 247 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */
238 pand_r2r(mm1, mm2); /* src & dst -> mm2 */ 248 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */
239 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */ 249 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */
240 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */ 250
241 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ 251 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
242 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */ 252 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */
243 dstp += 2; 253 dstp += 2;
244 srcp += 2; 254 srcp += 2;
245 }, width); 255 }, width);
246 srcp += srcskip; 256 srcp += srcskip;
247 dstp += dstskip; 257 dstp += dstskip;
248 } 258 }
249 emms(); 259 emms();
250 } 260 }
251 261
252 /* fast RGB888->(A)RGB888 blending with surface alpha */ 262 /* fast RGB888->(A)RGB888 blending with surface alpha */
253 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) 263 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
254 { 264 {
265 SDL_PixelFormat* df = info->dst;
255 unsigned alpha = info->src->alpha; 266 unsigned alpha = info->src->alpha;
256 if(alpha == 128) { 267
268 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
269 /* only call a128 version when R,G,B occupy lower bits */
257 BlitRGBtoRGBSurfaceAlpha128MMX(info); 270 BlitRGBtoRGBSurfaceAlpha128MMX(info);
258 } else { 271 } else {
259 int width = info->d_width; 272 int width = info->d_width;
260 int height = info->d_height; 273 int height = info->d_height;
261 Uint32 *srcp = (Uint32 *)info->s_pixels; 274 Uint32 *srcp = (Uint32 *)info->s_pixels;
262 int srcskip = info->s_skip >> 2; 275 int srcskip = info->s_skip >> 2;
263 Uint32 *dstp = (Uint32 *)info->d_pixels; 276 Uint32 *dstp = (Uint32 *)info->d_pixels;
264 int dstskip = info->d_skip >> 2; 277 int dstskip = info->d_skip >> 2;
265 Uint8 load[8] = {alpha, alpha, alpha, alpha, 278
266 alpha, alpha, alpha, alpha}; 279 pxor_r2r(mm5, mm5); /* 0 -> mm5 */
267 280 /* form the alpha mult */
268 movq_m2r(*load, mm4); /* alpha -> mm4 */ 281 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */
269 *(Uint64 *)load = 0x00FF00FF00FF00FFULL; 282 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
270 movq_m2r(*load, mm3); /* mask -> mm3 */ 283 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
271 pand_r2r(mm3, mm4); /* mm4 & mask -> 0A0A0A0A -> mm4 */ 284 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
272 *(Uint64 *)load = 0xFF000000FF000000ULL;/* dst alpha mask */ 285 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */
273 movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */ 286 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */
287 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */
288 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */
289 movd_m2r(df->Amask, mm7); /* dst alpha mask */
290 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */
274 291
275 while(height--) { 292 while(height--) {
276 DUFFS_LOOP_DOUBLE2({ 293 DUFFS_LOOP_DOUBLE2({
277 /* One Pixel Blend */ 294 /* One Pixel Blend */
278 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ 295 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
279 punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */ 296 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
280 pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */ 297 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */
281 298 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */
282 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ 299
283 movq_r2r(mm2, mm6);/* dst(ARGB) -> mm6 (0000ARGB)*/ 300 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
284 punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */ 301 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
285 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ 302 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
286 303 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
287 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ 304
288 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ 305 packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */
289 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ 306 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
290 paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ 307 movd_r2m(mm2, *dstp);/* mm2 -> pixel */
291 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */
292 packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */
293 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
294 movd_r2m(mm2, *dstp);/* mm2 -> Pixel */
295 ++srcp; 308 ++srcp;
296 ++dstp; 309 ++dstp;
297 },{ 310 },{
298 /* Two Pixels Blend */ 311 /* Two Pixels Blend */
299 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/ 312 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/
300 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */ 313 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */
301 punpcklbw_r2r(mm0, mm0); /* low - AARRGGBB -> mm0 */ 314 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */
302 pand_r2r(mm3, mm0); /* 0A0R0G0B -> mm0(src1) */ 315 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */
303 punpckhbw_r2r(mm1, mm1); /* high - AARRGGBB -> mm1 */ 316
304 pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1(src2) */ 317 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */
305 318 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */
306 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ 319 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */
307 movq_r2r(mm2, mm5); /* 2 x dst -> mm5(ARGBARGB) */ 320 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */
308 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ 321
309 punpcklbw_r2r(mm2, mm2); /* low - AARRGGBB -> mm2 */ 322 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */
310 punpckhbw_r2r(mm6, mm6); /* high - AARRGGBB -> mm6 */ 323 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */
311 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2(dst1) */ 324 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */
312 325 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */
313 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */ 326
314 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */ 327 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */
315 pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6(dst2) */ 328 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
316 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */ 329 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */
317 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */ 330 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */
318 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ 331
319 paddw_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */ 332 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */
320 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm0 */ 333 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */
321 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ 334
322 paddw_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */ 335 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */
323 pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6 */ 336
324 packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */ 337 srcp += 2;
325 packuswb_r2r(mm6, mm6); /* ARGBARGB -> mm6 */ 338 dstp += 2;
326 psrlq_i2r(32, mm2); /* mm2 >> 32 -> mm2 */ 339 }, width);
327 psllq_i2r(32, mm6); /* mm6 << 32 -> mm6 */
328 por_r2r(mm6, mm2); /* mm6 | mm2 -> mm2 */
329 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */
330 movq_r2m(mm2, *dstp);/* mm2 -> 2 x Pixel */
331 srcp += 2;
332 dstp += 2;
333 }, width);
334 srcp += srcskip; 340 srcp += srcskip;
335 dstp += dstskip; 341 dstp += dstskip;
336 } 342 }
337 emms(); 343 emms();
338 } 344 }
345 int height = info->d_height; 351 int height = info->d_height;
346 Uint32 *srcp = (Uint32 *)info->s_pixels; 352 Uint32 *srcp = (Uint32 *)info->s_pixels;
347 int srcskip = info->s_skip >> 2; 353 int srcskip = info->s_skip >> 2;
348 Uint32 *dstp = (Uint32 *)info->d_pixels; 354 Uint32 *dstp = (Uint32 *)info->d_pixels;
349 int dstskip = info->d_skip >> 2; 355 int dstskip = info->d_skip >> 2;
350 Uint32 alpha = 0; 356 SDL_PixelFormat* sf = info->src;
351 Uint8 load[8]; 357 Uint32 amask = sf->Amask;
352 358
353 *(Uint64 *)load = 0x00FF00FF00FF00FFULL; 359 pxor_r2r(mm6, mm6); /* 0 -> mm6 */
354 movq_m2r(*load, mm3); /* mask -> mm2 */ 360 /* form multiplication mask */
355 *(Uint64 *)load = 0x00FF000000000000ULL; 361 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */
356 movq_m2r(*load, mm7); /* dst alpha mask -> mm2 */ 362 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */
357 *(Uint64 *)load = 0x00FFFFFF00FFFFFFULL; 363 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */
358 movq_m2r(*load, mm0); /* alpha 255 mask -> mm0 */ 364 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */
359 *(Uint64 *)load = 0xFF000000FF000000ULL; 365 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */
360 movq_m2r(*load, mm6); /* alpha 255 !mask -> mm6 */ 366 /* form channel masks */
367 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */
368 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */
369 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */
370 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */
371 /* get alpha channel shift */
372 movd_m2r(sf->Ashift, mm5); /* Ashift -> mm5 */
373
361 while(height--) { 374 while(height--) {
362 DUFFS_LOOP4({ 375 DUFFS_LOOP4({
363 alpha = *srcp; 376 Uint32 alpha = *srcp & amask;
364 alpha >>= 24;
365 /* FIXME: Here we special-case opaque alpha since the 377 /* FIXME: Here we special-case opaque alpha since the
366 compositioning used (>>8 instead of /255) doesn't handle 378 compositioning used (>>8 instead of /255) doesn't handle
367 it correctly. Also special-case alpha=0 for speed? 379 it correctly. Also special-case alpha=0 for speed?
368 Benchmark this! */ 380 Benchmark this! */
369 if(alpha) { 381 if(alpha == 0) {
370 if(alpha == SDL_ALPHA_OPAQUE) { 382 /* do nothing */
371 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ 383 } else if(alpha == amask) {
372 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ 384 /* opaque alpha -- copy RGB, keep dst alpha */
373 pand_r2r(mm0, mm1); 385 /* using MMX here to free up regular registers for other things */
374 pand_r2r(mm6, mm2); 386 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
375 por_r2r(mm1, mm2); 387 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
376 movd_r2m(mm2, (*dstp)); 388 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */
377 } else { 389 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */
378 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ 390 por_r2r(mm1, mm2); /* src | dst -> mm2 */
379 punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */ 391 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */
380 pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */ 392 } else {
381 393 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/
382 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ 394 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */
383 punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */ 395
384 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ 396 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/
385 397 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */
386 movq_r2r(mm2, mm5);/* mm2(0A0R0G0B) -> mm5 */ 398
387 pand_r2r(mm7, mm5); /* mm5 & dst alpha mask -> mm5(0A000000) */ 399 __asm__ __volatile__ (
388 psrlq_i2r(24, mm5); /* mm5 >> 24 -> mm5 (0000A000)*/ 400 "movd %0, %%mm4"
389 401 : : "r" (alpha) ); /* 0000A000 -> mm4 */
390 movq_r2r(mm1, mm4);/* mm1(0A0R0G0B) -> mm4 */ 402 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */
391 psrlq_i2r(48, mm4); /* mm4 >> 48 -> mm4(0000000A) */ 403 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */
392 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */ 404 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */
393 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */ 405 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */
394 406
395 /* blend */ 407 /* blend */
396 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ 408 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */
397 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ 409 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */
398 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ 410 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */
399 paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ 411 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */
400 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ 412
401 packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */ 413 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */
402 pand_r2r(mm0, mm2); /* 0RGB0RGB -> mm2 */ 414 movd_r2m(mm2, *dstp);/* mm2 -> dst */
403 por_r2r(mm5, mm2); /* dst alpha | mm2 -> mm2 */
404 movd_r2m(mm2, *dstp);/* mm2 -> dst */
405 }
406 } 415 }
407 ++srcp; 416 ++srcp;
408 ++dstp; 417 ++dstp;
409 }, width); 418 }, width);
410 srcp += srcskip; 419 srcp += srcskip;
411 dstp += dstskip; 420 dstp += dstskip;
412 } 421 }
413 emms(); 422 emms();
414 } 423 }
415 #endif 424 /* End GCC_ASMBLIT */
425
426 #elif MSVC_ASMBLIT
427 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
428 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info)
429 {
430 int width = info->d_width;
431 int height = info->d_height;
432 Uint32 *srcp = (Uint32 *)info->s_pixels;
433 int srcskip = info->s_skip >> 2;
434 Uint32 *dstp = (Uint32 *)info->d_pixels;
435 int dstskip = info->d_skip >> 2;
436 Uint32 dalpha = info->dst->Amask;
437
438 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
439
440 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
441 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
442 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
443
444 while (height--) {
445 int n = width;
446 if ( n & 1 ) {
447 Uint32 s = *srcp++;
448 Uint32 d = *dstp;
449 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
450 + (s & d & 0x00010101)) | dalpha;
451 n--;
452 }
453
454 for (n >>= 1; n > 0; --n) {
455 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */
456 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
457
458 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */
459 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
460
461 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
462 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
463 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
464 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
465
466 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
467 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
468 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
469 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
470
471 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */
472 dstp += 2;
473 srcp += 2;
474 }
475
476 srcp += srcskip;
477 dstp += dstskip;
478 }
479 _mm_empty();
480 }
481
482 /* fast RGB888->(A)RGB888 blending with surface alpha */
483 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info)
484 {
485 SDL_PixelFormat* df = info->dst;
486 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask;
487 unsigned alpha = info->src->alpha;
488
489 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
490 /* only call a128 version when R,G,B occupy lower bits */
491 BlitRGBtoRGBSurfaceAlpha128MMX(info);
492 } else {
493 int width = info->d_width;
494 int height = info->d_height;
495 Uint32 *srcp = (Uint32 *)info->s_pixels;
496 int srcskip = info->s_skip >> 2;
497 Uint32 *dstp = (Uint32 *)info->d_pixels;
498 int dstskip = info->d_skip >> 2;
499 Uint32 dalpha = df->Amask;
500 Uint32 amult;
501
502 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
503
504 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
505 /* form the alpha mult */
506 amult = alpha | (alpha << 8);
507 amult = amult | (amult << 16);
508 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift);
509 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
510 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
511 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
512 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
513
514 while (height--) {
515 int n = width;
516 if (n & 1) {
517 /* One Pixel Blend */
518 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/
519 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
520
521 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
522 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
523
524 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
525 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
526 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
527 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
528
529 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
530 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
531 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
532
533 ++srcp;
534 ++dstp;
535
536 n--;
537 }
538
539 for (n >>= 1; n > 0; --n) {
540 /* Two Pixels Blend */
541 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/
542 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
543 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
544 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
545
546 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */
547 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
548 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
549 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
550
551 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
552 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
553 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
554 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
555
556 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */
557 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
558 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
559 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
560
561 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
562 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
563
564 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */
565
566 srcp += 2;
567 dstp += 2;
568 }
569 srcp += srcskip;
570 dstp += dstskip;
571 }
572 _mm_empty();
573 }
574 }
575
576 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
577 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info)
578 {
579 int width = info->d_width;
580 int height = info->d_height;
581 Uint32 *srcp = (Uint32 *)info->s_pixels;
582 int srcskip = info->s_skip >> 2;
583 Uint32 *dstp = (Uint32 *)info->d_pixels;
584 int dstskip = info->d_skip >> 2;
585 SDL_PixelFormat* sf = info->src;
586 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask;
587 Uint32 amask = sf->Amask;
588 Uint32 ashift = sf->Ashift;
589 Uint64 multmask;
590
591 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
592
593 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
594 multmask = ~(0xFFFFi64 << (ashift * 2));
595 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
596
597 while(height--) {
598 DUFFS_LOOP4({
599 Uint32 alpha = *srcp & amask;
600 if (alpha == 0) {
601 /* do nothing */
602 } else if (alpha == amask) {
603 /* opaque alpha -- copy RGB, keep dst alpha */
604 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
605 } else {
606 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
607 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
608
609 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
610 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
611
612 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
613 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
614 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
615 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
616 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
617
618 /* blend */
619 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */
620 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */
621 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
622 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */
623 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
624
625 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
626 }
627 ++srcp;
628 ++dstp;
629 }, width);
630 srcp += srcskip;
631 dstp += dstskip;
632 }
633 _mm_empty();
634 }
635 /* End MSVC_ASMBLIT */
636
637 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
416 638
417 #if SDL_ALTIVEC_BLITTERS 639 #if SDL_ALTIVEC_BLITTERS
418 #if HAVE_ALTIVEC_H 640 #if HAVE_ALTIVEC_H
419 #include <altivec.h> 641 #include <altivec.h>
420 #endif 642 #endif
1324 srcp += srcskip; 1546 srcp += srcskip;
1325 dstp += dstskip; 1547 dstp += dstskip;
1326 } 1548 }
1327 } 1549 }
1328 1550
1329 #if MMX_ASMBLIT 1551 #if GCC_ASMBLIT
1330 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ 1552 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
1331 inline static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info) 1553 inline static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1332 { 1554 {
1333 int width = info->d_width; 1555 int width = info->d_width;
1334 int height = info->d_height; 1556 int height = info->d_height;
1335 Uint32 *srcp = (Uint32 *)info->s_pixels; 1557 Uint32 *srcp = (Uint32 *)info->s_pixels;
1336 int srcskip = info->s_skip >> 2; 1558 int srcskip = info->s_skip >> 2;
1337 Uint32 *dstp = (Uint32 *)info->d_pixels; 1559 Uint32 *dstp = (Uint32 *)info->d_pixels;
1338 int dstskip = info->d_skip >> 2; 1560 int dstskip = info->d_skip >> 2;
1339 1561 SDL_PixelFormat* sf = info->src;
1340 Uint32 s; 1562 Uint32 amask = sf->Amask;
1341 Uint32 alpha;
1342 1563
1343 __asm__ ( 1564 __asm__ (
1344 /* make mm6 all zeros. */ 1565 /* make mm6 all zeros. */
1345 "pxor %%mm6, %%mm6\n" 1566 "pxor %%mm6, %%mm6\n"
1346 1567
1347 /* Make a mask to preserve the alpha. */ 1568 /* Make a mask to preserve the alpha. */
1348 "pcmpeqb %%mm7, %%mm7\n\t" /* mm7(s) = FF FF FF FF | FF FF FF FF */ 1569 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */
1349 "psrlq $16, %%mm7\n\t" /* mm7(s) = 00 00 FF FF | FF FF FF FF */ 1570 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */
1350 1571 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */
1351 : ); 1572 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */
1573 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */
1574
1575 /* form channel masks */
1576 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */
1577 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */
1578 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */
1579 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */
1580
1581 /* get alpha channel shift */
1582 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */
1583
1584 : /* nothing */ : "m" (sf->Amask), "m" (sf->Ashift) );
1352 1585
1353 while(height--) { 1586 while(height--) {
1354 1587
1355 DUFFS_LOOP4({ 1588 DUFFS_LOOP4({
1589 Uint32 alpha;
1356 1590
1357 __asm__ ( 1591 __asm__ (
1358 "prefetch 64(%0)\n" 1592 "prefetch 64(%0)\n"
1359 "prefetch 64(%1)\n" 1593 "prefetch 64(%1)\n"
1360 : : "r" (srcp), "r" (dstp) ); 1594 : : "r" (srcp), "r" (dstp) );
1361 1595
1362 s = *srcp; 1596 alpha = *srcp & amask;
1363 alpha = s >> 24;
1364 /* FIXME: Here we special-case opaque alpha since the 1597 /* FIXME: Here we special-case opaque alpha since the
1365 compositioning used (>>8 instead of /255) doesn't handle 1598 compositioning used (>>8 instead of /255) doesn't handle
1366 it correctly. Also special-case alpha=0 for speed? 1599 it correctly. Also special-case alpha=0 for speed?
1367 Benchmark this! */ 1600 Benchmark this! */
1368 1601 if(alpha == 0) {
1369 if(alpha == SDL_ALPHA_OPAQUE) { 1602 /* do nothing */
1370 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); 1603 }
1604 else if(alpha == amask) {
1605 /* opaque alpha -- copy RGB, keep dst alpha */
1606 /* using MMX here to free up regular registers for other things */
1607 __asm__ (
1608 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/
1609 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/
1610 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */
1611 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */
1612 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */
1613 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */
1614
1615 : : "r" (srcp), "r" (dstp) );
1371 } 1616 }
1372 1617
1373 else { 1618 else {
1374 __asm__ ( 1619 __asm__ (
1375 /* load in the source, and dst. */ 1620 /* load in the source, and dst. */
1381 /* if supporting pshufw */ 1626 /* if supporting pshufw */
1382 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */ 1627 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */
1383 /*"psrlw $8, %%mm2\n" */ 1628 /*"psrlw $8, %%mm2\n" */
1384 1629
1385 /* else: */ 1630 /* else: */
1386 "movq %%mm0, %%mm2\n" 1631 "movd %2, %%mm2\n"
1387 "psrld $24, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */ 1632 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */
1388 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */ 1633 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */
1389 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */ 1634 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */
1635 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */
1390 1636
1391 /* move the colors into words. */ 1637 /* move the colors into words. */
1392 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */ 1638 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */
1393 "punpcklbw %%mm6, %%mm1\n" /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */ 1639 "punpcklbw %%mm6, %%mm1\n" /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */
1394 1640
1395 /* src - dst */ 1641 /* src - dst */
1396 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */ 1642 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */
1397 1643
1398 /* A * (src-dst) */ 1644 /* A * (src-dst) */
1399 "pmullw %%mm2, %%mm0\n" /* mm0 = As*As-d As*Rs-d | As*Gs-d As*Bs-d */ 1645 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */
1400 "pand %%mm7, %%mm0\n" /* to preserve dest alpha */ 1646 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */
1401 "psrlw $8, %%mm0\n" /* mm0 = Ac>>8 Rc>>8 | Gc>>8 Bc>>8 */ 1647 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */
1402 "paddb %%mm1, %%mm0\n" /* mm0 = Ac+Ad Rc+Rd | Gc+Gd Bc+Bd */
1403 1648
1404 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */ 1649 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */
1405 1650
1406 "movd %%mm0, (%1)\n" /* result in mm0 */ 1651 "movd %%mm0, (%1)\n" /* result in mm0 */
1407 1652
1408 : : "r" (srcp), "r" (dstp) ); 1653 : : "r" (srcp), "r" (dstp), "r" (alpha) );
1409 1654
1410 } 1655 }
1411 ++srcp; 1656 ++srcp;
1412 ++dstp; 1657 ++dstp;
1413 }, width); 1658 }, width);
1417 1662
1418 __asm__ ( 1663 __asm__ (
1419 "emms\n" 1664 "emms\n"
1420 : ); 1665 : );
1421 } 1666 }
1422 #endif 1667 /* End GCC_ASMBLIT*/
1668
1669 #elif MSVC_ASMBLIT
1670 /*
 * Per-pixel alpha blend of 32-bit source pixels onto 32-bit destination
 * pixels, written with MMX intrinsics plus _m_prefetch hints on both
 * buffers (MSVC_ASMBLIT build).
 *
 * The position of the alpha channel is read from info->src (Amask/Ashift)
 * rather than being hard-coded.  For every pixel:
 *   - source alpha == 0:     the destination is left untouched;
 *   - source alpha == amask: the source colour bits (chanmask) are copied
 *                            and the remaining destination bits are kept;
 *   - otherwise:             each 8-bit channel becomes
 *                            dst + (((src - dst) * alpha) >> 8), using a
 *                            byte-wise add; the multiplier of the alpha
 *                            lane is zeroed so destination alpha is kept.
 *
 * NOTE(review): the lane arithmetic assumes every channel, alpha included,
 * occupies one whole byte of the pixel - confirm against the caller that
 * selects this blitter.
 */
1671 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info)
1672 {
1673 int width = info->d_width;
1674 int height = info->d_height;
1675 Uint32 *srcp = (Uint32 *)info->s_pixels;
1676 int srcskip = info->s_skip >> 2; /* s_skip / 4: presumably a byte count converted to Uint32 units */
1677 Uint32 *dstp = (Uint32 *)info->d_pixels;
1678 int dstskip = info->d_skip >> 2; /* same conversion for the destination row padding */
1679 SDL_PixelFormat* sf = info->src;
1680 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; /* all colour bits of the source format */
1681 Uint32 amask = sf->Amask;
1682 Uint32 ashift = sf->Ashift;
1683 Uint64 multmask;
1684
1685 __m64 src1, dst1, mm_alpha, mm_zero, dmask;
1686
1687 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
1688 multmask = ~(0xFFFFi64 << (ashift * 2)); /* once bytes are unpacked to words, the byte at bit ashift sits at bit ashift*2: clear that 16-bit lane */
1689 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */
1690
1691 while(height--) {
1692 DUFFS_LOOP4({
1693 Uint32 alpha;
1694
1695 _m_prefetch(srcp + 16); /* prefetch hint 16 pixels ahead of the current source pixel */
1696 _m_prefetch(dstp + 16); /* and 16 pixels ahead of the current destination pixel */
1697
1698 alpha = *srcp & amask; /* alpha is still in its in-pixel position here */
1699 if (alpha == 0) {
1700 /* do nothing */
1701 } else if (alpha == amask) {
1702 /* copy RGB, keep dst alpha */
1703 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask);
1704 } else {
1705 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/
1706 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
1707
1708 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/
1709 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
1710
1711 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
1712 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
1713 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
1714 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
1715 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */
1716
1717 /* blend */
1718 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */
1719 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */
1720 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */
1721 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */
1722 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
1723
1724 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
1725 }
1726 ++srcp;
1727 ++dstp;
1728 }, width);
1729 srcp += srcskip;
1730 dstp += dstskip;
1731 }
1732 _mm_empty(); /* leave MMX state before returning */
1733 }
1734 /* End MSVC_ASMBLIT */
1735
1736 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1423 1737
1424 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ 1738 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
1425 1739
1426 /* blend a single 16 bit pixel at 50% */ 1740 /* blend a single 16 bit pixel at 50% */
1427 #define BLEND16_50(d, s, mask) \ 1741 #define BLEND16_50(d, s, mask) \
1528 dstp += dstskip; 1842 dstp += dstskip;
1529 } 1843 }
1530 } 1844 }
1531 } 1845 }
1532 1846
1533 #if MMX_ASMBLIT 1847 #if GCC_ASMBLIT
1534 /* fast RGB565->RGB565 blending with surface alpha */ 1848 /* fast RGB565->RGB565 blending with surface alpha */
1535 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) 1849 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
1536 { 1850 {
1537 unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */ 1851 unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */
1538 if(alpha == 128) { 1852 if(alpha == 128) {
1542 int height = info->d_height; 1856 int height = info->d_height;
1543 Uint16 *srcp = (Uint16 *)info->s_pixels; 1857 Uint16 *srcp = (Uint16 *)info->s_pixels;
1544 int srcskip = info->s_skip >> 1; 1858 int srcskip = info->s_skip >> 1;
1545 Uint16 *dstp = (Uint16 *)info->d_pixels; 1859 Uint16 *dstp = (Uint16 *)info->d_pixels;
1546 int dstskip = info->d_skip >> 1; 1860 int dstskip = info->d_skip >> 1;
1547 Uint32 s, d; 1861 Uint32 s, d;
1548 Uint8 load[8]; 1862 Uint8 load[8];
1549 1863
1550 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ 1864 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
1551 *(Uint64 *)load = alpha; 1865 *(Uint64 *)load = alpha;
1552 alpha >>= 3; /* downscale alpha to 5 bits */ 1866 alpha >>= 3; /* downscale alpha to 5 bits */
1553 1867
1554 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ 1868 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
1555 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ 1869 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1556 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ 1870 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1871 /* position alpha to allow for mullo and mulhi on diff channels
1872 to reduce the number of operations */
1873 psllq_i2r(3, mm0);
1557 1874
1558 /* Setup the 565 color channel masks */ 1875 /* Setup the 565 color channel masks */
1559 *(Uint64 *)load = 0xF800F800F800F800ULL;
1560 movq_m2r(*load, mm1); /* MASKRED -> mm1 */
1561 *(Uint64 *)load = 0x07E007E007E007E0ULL; 1876 *(Uint64 *)load = 0x07E007E007E007E0ULL;
1562 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */ 1877 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
1563 *(Uint64 *)load = 0x001F001F001F001FULL; 1878 *(Uint64 *)load = 0x001F001F001F001FULL;
1564 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */ 1879 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
1565 while(height--) { 1880 while(height--) {
1566 DUFFS_LOOP_QUATRO2( 1881 DUFFS_LOOP_QUATRO2(
1567 { 1882 {
1568 s = *srcp++; 1883 s = *srcp++;
1569 d = *dstp; 1884 d = *dstp;
1570 /* 1885 /*
1571 * shift out the middle component (green) to 1886 * shift out the middle component (green) to
1572 * the high 16 bits, and process all three RGB 1887 * the high 16 bits, and process all three RGB
1573 * components at the same time. 1888 * components at the same time.
1575 s = (s | s << 16) & 0x07e0f81f; 1890 s = (s | s << 16) & 0x07e0f81f;
1576 d = (d | d << 16) & 0x07e0f81f; 1891 d = (d | d << 16) & 0x07e0f81f;
1577 d += (s - d) * alpha >> 5; 1892 d += (s - d) * alpha >> 5;
1578 d &= 0x07e0f81f; 1893 d &= 0x07e0f81f;
1579 *dstp++ = d | d >> 16; 1894 *dstp++ = d | d >> 16;
1580 },{ 1895 },{
1581 s = *srcp++; 1896 s = *srcp++;
1582 d = *dstp; 1897 d = *dstp;
1583 /* 1898 /*
1584 * shift out the middle component (green) to 1899 * shift out the middle component (green) to
1585 * the high 16 bits, and process all three RGB 1900 * the high 16 bits, and process all three RGB
1586 * components at the same time. 1901 * components at the same time.
1588 s = (s | s << 16) & 0x07e0f81f; 1903 s = (s | s << 16) & 0x07e0f81f;
1589 d = (d | d << 16) & 0x07e0f81f; 1904 d = (d | d << 16) & 0x07e0f81f;
1590 d += (s - d) * alpha >> 5; 1905 d += (s - d) * alpha >> 5;
1591 d &= 0x07e0f81f; 1906 d &= 0x07e0f81f;
1592 *dstp++ = d | d >> 16; 1907 *dstp++ = d | d >> 16;
1593 s = *srcp++; 1908 s = *srcp++;
1594 d = *dstp; 1909 d = *dstp;
1595 /* 1910 /*
1596 * shift out the middle component (green) to 1911 * shift out the middle component (green) to
1597 * the high 16 bits, and process all three RGB 1912 * the high 16 bits, and process all three RGB
1598 * components at the same time. 1913 * components at the same time.
1600 s = (s | s << 16) & 0x07e0f81f; 1915 s = (s | s << 16) & 0x07e0f81f;
1601 d = (d | d << 16) & 0x07e0f81f; 1916 d = (d | d << 16) & 0x07e0f81f;
1602 d += (s - d) * alpha >> 5; 1917 d += (s - d) * alpha >> 5;
1603 d &= 0x07e0f81f; 1918 d &= 0x07e0f81f;
1604 *dstp++ = d | d >> 16; 1919 *dstp++ = d | d >> 16;
1605 },{ 1920 },{
1606 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ 1921 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1607 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ 1922 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1608 1923
1609 /* RED */ 1924 /* red -- does not need a mask since the right shift clears
1610 movq_r2r(mm2, mm5); /* src -> mm5 */ 1925 the uninteresting bits */
1611 pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */ 1926 movq_r2r(mm2, mm5); /* src -> mm5 */
1612 psrlq_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */ 1927 movq_r2r(mm3, mm6); /* dst -> mm6 */
1613 1928 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */
1614 movq_r2r(mm3, mm6); /* dst -> mm6 */ 1929 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */
1615 pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */ 1930
1616 psrlq_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */ 1931 /* blend */
1617 1932 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1618 /* blend */ 1933 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1619 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ 1934 /* alpha used is actually 11 bits
1620 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ 1935 11 + 5 = 16 bits, so the sign bits are lost */
1621 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ 1936 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1622 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ 1937 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1623 psllq_i2r(11, mm6); /* mm6 << 11 -> mm6 */ 1938 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */
1624 pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */ 1939
1625 1940 movq_r2r(mm6, mm1); /* save new reds in dsts */
1626 movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */ 1941
1627 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ 1942 /* green -- process the bits in place */
1628 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */ 1943 movq_r2r(mm2, mm5); /* src -> mm5 */
1629 por_r2r(mm6, mm3); /* save new reds in dsts */ 1944 movq_r2r(mm3, mm6); /* dst -> mm6 */
1630 1945 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
1631 /* green */ 1946 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
1632 movq_r2r(mm2, mm5); /* src -> mm5 */ 1947
1633 pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */ 1948 /* blend */
1634 psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */ 1949 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1635 1950 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1636 movq_r2r(mm3, mm6); /* dst -> mm6 */ 1951 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting
1637 pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */ 1952 bits are gone and the sign bits present */
1638 psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */ 1953 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
1639 1954 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1640 /* blend */ 1955
1641 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ 1956 por_r2r(mm6, mm1); /* save new greens in dsts */
1642 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ 1957
1643 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ 1958 /* blue */
1644 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ 1959 movq_r2r(mm2, mm5); /* src -> mm5 */
1645 psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */ 1960 movq_r2r(mm3, mm6); /* dst -> mm6 */
1646 pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */ 1961 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
1647 1962 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
1648 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ 1963
1649 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ 1964 /* blend */
1650 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */ 1965 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1651 por_r2r(mm6, mm3); /* save new greens in dsts */ 1966 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1652 1967 /* 11 + 5 = 16 bits, so the sign bits are lost and
1653 /* blue */ 1968 the interesting bits will need to be MASKed */
1654 movq_r2r(mm2, mm5); /* src -> mm5 */ 1969 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1655 pand_r2r(mm7 , mm5); /* src & MASKRED -> mm5[000b 000b 000b 000b] */ 1970 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1656 1971 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
1657 movq_r2r(mm3, mm6); /* dst -> mm6 */ 1972
1658 pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ 1973 por_r2r(mm6, mm1); /* save new blues in dsts */
1659 1974
1660 /* blend */ 1975 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */
1661 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ 1976
1662 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ 1977 srcp += 4;
1663 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ 1978 dstp += 4;
1664 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ 1979 }, width);
1665 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */
1666
1667 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */
1668 por_r2r(mm4, mm5); /* MASKGREEN | mm5 -> mm5 */
1669 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */
1670 por_r2r(mm6, mm3); /* save new blues in dsts */
1671
1672 movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */
1673
1674 srcp += 4;
1675 dstp += 4;
1676 }, width);
1677 srcp += srcskip; 1980 srcp += srcskip;
1678 dstp += dstskip; 1981 dstp += dstskip;
1679 } 1982 }
1680 emms(); 1983 emms();
1681 } 1984 }
1692 int height = info->d_height; 1995 int height = info->d_height;
1693 Uint16 *srcp = (Uint16 *)info->s_pixels; 1996 Uint16 *srcp = (Uint16 *)info->s_pixels;
1694 int srcskip = info->s_skip >> 1; 1997 int srcskip = info->s_skip >> 1;
1695 Uint16 *dstp = (Uint16 *)info->d_pixels; 1998 Uint16 *dstp = (Uint16 *)info->d_pixels;
1696 int dstskip = info->d_skip >> 1; 1999 int dstskip = info->d_skip >> 1;
1697 Uint32 s, d; 2000 Uint32 s, d;
1698 Uint8 load[8]; 2001 Uint8 load[8];
1699 2002
1700 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ 2003 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
1701 *(Uint64 *)load = alpha; 2004 *(Uint64 *)load = alpha;
1702 alpha >>= 3; /* downscale alpha to 5 bits */ 2005 alpha >>= 3; /* downscale alpha to 5 bits */
1703 2006
1704 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ 2007 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */
1705 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ 2008 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */
1706 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ 2009 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */
1707 2010 /* position alpha to allow for mullo and mulhi on diff channels
1708 /* Setup the 555 color channel masks */ 2011 to reduce the number of operations */
1709 *(Uint64 *)load = 0x7C007C007C007C00ULL; 2012 psllq_i2r(3, mm0);
1710 movq_m2r(*load, mm1); /* MASKRED -> mm1 */ 2013
2014 /* Setup the 555 color channel masks */
1711 *(Uint64 *)load = 0x03E003E003E003E0ULL; 2015 *(Uint64 *)load = 0x03E003E003E003E0ULL;
1712 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */ 2016 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */
1713 *(Uint64 *)load = 0x001F001F001F001FULL; 2017 *(Uint64 *)load = 0x001F001F001F001FULL;
1714 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */ 2018 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */
1715 while(height--) { 2019 while(height--) {
1716 DUFFS_LOOP_QUATRO2( 2020 DUFFS_LOOP_QUATRO2(
1717 { 2021 {
1718 s = *srcp++; 2022 s = *srcp++;
1719 d = *dstp; 2023 d = *dstp;
1720 /* 2024 /*
1721 * shift out the middle component (green) to 2025 * shift out the middle component (green) to
1722 * the high 16 bits, and process all three RGB 2026 * the high 16 bits, and process all three RGB
1723 * components at the same time. 2027 * components at the same time.
1725 s = (s | s << 16) & 0x03e07c1f; 2029 s = (s | s << 16) & 0x03e07c1f;
1726 d = (d | d << 16) & 0x03e07c1f; 2030 d = (d | d << 16) & 0x03e07c1f;
1727 d += (s - d) * alpha >> 5; 2031 d += (s - d) * alpha >> 5;
1728 d &= 0x03e07c1f; 2032 d &= 0x03e07c1f;
1729 *dstp++ = d | d >> 16; 2033 *dstp++ = d | d >> 16;
1730 },{ 2034 },{
1731 s = *srcp++; 2035 s = *srcp++;
1732 d = *dstp; 2036 d = *dstp;
1733 /* 2037 /*
1734 * shift out the middle component (green) to 2038 * shift out the middle component (green) to
1735 * the high 16 bits, and process all three RGB 2039 * the high 16 bits, and process all three RGB
1736 * components at the same time. 2040 * components at the same time.
1750 s = (s | s << 16) & 0x03e07c1f; 2054 s = (s | s << 16) & 0x03e07c1f;
1751 d = (d | d << 16) & 0x03e07c1f; 2055 d = (d | d << 16) & 0x03e07c1f;
1752 d += (s - d) * alpha >> 5; 2056 d += (s - d) * alpha >> 5;
1753 d &= 0x03e07c1f; 2057 d &= 0x03e07c1f;
1754 *dstp++ = d | d >> 16; 2058 *dstp++ = d | d >> 16;
1755 },{ 2059 },{
1756 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ 2060 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */
1757 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ 2061 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */
1758 2062
1759 /* RED */ 2063 /* red -- process the bits in place */
1760 movq_r2r(mm2, mm5); /* src -> mm5 */ 2064 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */
1761 pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */ 2065 /* by reusing the GREEN mask we free up another mmx
1762 psrlq_i2r(10, mm5); /* mm5 >> 10 -> mm5 [000r 000r 000r 000r] */ 2066 register to accumulate the result */
1763 2067
1764 movq_r2r(mm3, mm6); /* dst -> mm6 */ 2068 movq_r2r(mm2, mm5); /* src -> mm5 */
1765 pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */ 2069 movq_r2r(mm3, mm6); /* dst -> mm6 */
1766 psrlq_i2r(10, mm6); /* mm6 >> 10 -> mm6 [000r 000r 000r 000r] */ 2070 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */
1767 2071 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */
1768 /* blend */ 2072
1769 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ 2073 /* blend */
1770 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ 2074 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1771 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ 2075 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1772 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ 2076 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be
1773 psllq_i2r(10, mm6); /* mm6 << 10 -> mm6 */ 2077 cleared by a MASK below */
1774 pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */ 2078 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
1775 2079 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1776 movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */ 2080 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */
1777 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ 2081
1778 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */ 2082 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */
1779 por_r2r(mm6, mm3); /* save new reds in dsts */ 2083
1780 2084 movq_r2r(mm6, mm1); /* save new reds in dsts */
1781 /* green */ 2085
1782 movq_r2r(mm2, mm5); /* src -> mm5 */ 2086 /* green -- process the bits in place */
1783 pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */ 2087 movq_r2r(mm2, mm5); /* src -> mm5 */
1784 psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */ 2088 movq_r2r(mm3, mm6); /* dst -> mm6 */
1785 2089 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */
1786 movq_r2r(mm3, mm6); /* dst -> mm6 */ 2090 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */
1787 pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */ 2091
1788 psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */ 2092 /* blend */
1789 2093 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1790 /* blend */ 2094 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1791 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ 2095 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting
1792 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ 2096 bits are gone and the sign bits present */
1793 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ 2097 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */
1794 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ 2098 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1795 psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */ 2099
1796 pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */ 2100 por_r2r(mm6, mm1); /* save new greens in dsts */
1797 2101
1798 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ 2102 /* blue */
1799 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ 2103 movq_r2r(mm2, mm5); /* src -> mm5 */
1800 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */ 2104 movq_r2r(mm3, mm6); /* dst -> mm6 */
1801 por_r2r(mm6, mm3); /* save new greens in dsts */ 2105 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */
1802 2106 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */
1803 /* blue */ 2107
1804 movq_r2r(mm2, mm5); /* src -> mm5 */ 2108 /* blend */
1805 pand_r2r(mm7 , mm5); /* src & MASKRED -> mm5[000b 000b 000b 000b] */ 2109 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */
1806 2110 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */
1807 movq_r2r(mm3, mm6); /* dst -> mm6 */ 2111 /* 11 + 5 = 16 bits, so the sign bits are lost and
1808 pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ 2112 the interesting bits will need to be MASKed */
1809 2113 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */
1810 /* blend */ 2114 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */
1811 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ 2115 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */
1812 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ 2116
1813 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ 2117 por_r2r(mm6, mm1); /* save new blues in dsts */
1814 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ 2118
1815 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */ 2119 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */
1816 2120
1817 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ 2121 srcp += 4;
1818 por_r2r(mm4, mm5); /* MASKGREEN | mm5 -> mm5 */ 2122 dstp += 4;
1819 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */ 2123 }, width);
1820 por_r2r(mm6, mm3); /* save new blues in dsts */
1821
1822 movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */
1823
1824 srcp += 4;
1825 dstp += 4;
1826 }, width);
1827 srcp += srcskip; 2124 srcp += srcskip;
1828 dstp += dstskip; 2125 dstp += dstskip;
1829 } 2126 }
1830 emms(); 2127 emms();
1831 } 2128 }
1832 } 2129 }
1833 #endif 2130 /* End GCC_ASMBLIT */
2131
2132 #elif MSVC_ASMBLIT
2133 /*
 * Blend an RGB565 source onto an RGB565 destination using the per-surface
 * alpha in info->src->alpha (MSVC MMX-intrinsics build).
 *
 * alpha == 128 is delegated to Blit16to16SurfaceAlpha128().  Otherwise the
 * low three bits of alpha are dropped and every row is handed to
 * DUFFS_LOOP_QUATRO2 with three bodies: a scalar one-pixel body, a scalar
 * two-pixel body, and an MMX body that blends four pixels per iteration.
 * The scalar bodies compute dst += ((src - dst) * alpha5) >> 5 per channel
 * (alpha5 = alpha >> 3); the MMX body is arranged to give the same result.
 */
2134 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info)
2135 {
2136 unsigned alpha = info->src->alpha;
2137 if(alpha == 128) {
2138 Blit16to16SurfaceAlpha128(info, 0xf7de); /* helper defined elsewhere in the file; 0xf7de is its mask argument */
2139 } else {
2140 int width = info->d_width;
2141 int height = info->d_height;
2142 Uint16 *srcp = (Uint16 *)info->s_pixels;
2143 int srcskip = info->s_skip >> 1; /* s_skip / 2: presumably a byte count converted to Uint16 units */
2144 Uint16 *dstp = (Uint16 *)info->d_pixels;
2145 int dstskip = info->d_skip >> 1; /* same conversion for the destination row padding */
2146 Uint32 s, d;
2147
2148 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;
2149
2150 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2151 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2152 alpha >>= 3; /* downscale alpha to 5 bits */
2153
2154 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2155 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2156 /* position alpha to allow for mullo and mulhi on diff channels
2157 to reduce the number of operations */
2158 mm_alpha = _mm_slli_si64(mm_alpha, 3); /* each lane becomes (alpha & ~7) << 3; assuming alpha < 256 this cannot spill into the next lane */
2159
2160 /* Setup the 565 color channel masks */
2161 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */
2162 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2163
2164 while(height--) {
2165 DUFFS_LOOP_QUATRO2(
2166 {
2167 s = *srcp++;
2168 d = *dstp;
2169 /*
2170 * shift out the middle component (green) to
2171 * the high 16 bits, and process all three RGB
2172 * components at the same time.
2173 */
2174 s = (s | s << 16) & 0x07e0f81f;
2175 d = (d | d << 16) & 0x07e0f81f;
2176 d += (s - d) * alpha >> 5;
2177 d &= 0x07e0f81f;
2178 *dstp++ = d | d >> 16;
2179 },{
2180 s = *srcp++;
2181 d = *dstp;
2182 /*
2183 * shift out the middle component (green) to
2184 * the high 16 bits, and process all three RGB
2185 * components at the same time.
2186 */
2187 s = (s | s << 16) & 0x07e0f81f;
2188 d = (d | d << 16) & 0x07e0f81f;
2189 d += (s - d) * alpha >> 5;
2190 d &= 0x07e0f81f;
2191 *dstp++ = d | d >> 16;
2192 s = *srcp++;
2193 d = *dstp;
2194 /*
2195 * shift out the middle component (green) to
2196 * the high 16 bits, and process all three RGB
2197 * components at the same time.
2198 */
2199 s = (s | s << 16) & 0x07e0f81f;
2200 d = (d | d << 16) & 0x07e0f81f;
2201 d += (s - d) * alpha >> 5;
2202 d &= 0x07e0f81f;
2203 *dstp++ = d | d >> 16;
2204 },{
2205 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2206 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2207
2208 /* red */
2209 src2 = src1;
2210 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */
2211
2212 dst2 = dst1;
2213 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */
2214
2215 /* blend */
2216 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2217 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2218 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2219 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2220 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */
2221
2222 mm_res = dst2; /* RED -> mm_res */
2223
2224 /* green -- process the bits in place */
2225 src2 = src1;
2226 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2227
2228 dst2 = dst1;
2229 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2230
2231 /* blend */
2232 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2233 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2234 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2235 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2236
2237 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2238
2239 /* blue */
2240 src2 = src1;
2241 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2242
2243 dst2 = dst1;
2244 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2245
2246 /* blend */
2247 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2248 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2249 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2250 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2251 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2252
2253 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2254
2255 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2256
2257 srcp += 4;
2258 dstp += 4;
2259 }, width);
2260 srcp += srcskip;
2261 dstp += dstskip;
2262 }
2263 _mm_empty(); /* leave MMX state before returning */
2264 }
2265 }
2266
2267 /*
 * Blend an RGB555 source onto an RGB555 destination using the per-surface
 * alpha in info->src->alpha (MSVC MMX-intrinsics build).
 *
 * alpha == 128 is delegated to Blit16to16SurfaceAlpha128().  Otherwise the
 * low three bits of alpha are dropped and every row is handed to
 * DUFFS_LOOP_QUATRO2 with three bodies: a scalar one-pixel body, a scalar
 * two-pixel body, and an MMX body that blends four pixels per iteration.
 * The scalar bodies compute dst += ((src - dst) * alpha5) >> 5 per channel
 * (alpha5 = alpha >> 3); the MMX body is arranged to give the same result.
 * Unlike the 565 variant, red is blended in place under rmask here instead
 * of being shifted down first.
 */
2268 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info)
2269 {
2270 unsigned alpha = info->src->alpha;
2271 if(alpha == 128) {
2272 Blit16to16SurfaceAlpha128(info, 0xfbde); /* helper defined elsewhere in the file; 0xfbde is its mask argument */
2273 } else {
2274 int width = info->d_width;
2275 int height = info->d_height;
2276 Uint16 *srcp = (Uint16 *)info->s_pixels;
2277 int srcskip = info->s_skip >> 1; /* s_skip / 2: presumably a byte count converted to Uint16 units */
2278 Uint16 *dstp = (Uint16 *)info->d_pixels;
2279 int dstskip = info->d_skip >> 1; /* same conversion for the destination row padding */
2280 Uint32 s, d;
2281
2282 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;
2283
2284 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */
2285 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */
2286 alpha >>= 3; /* downscale alpha to 5 bits */
2287
2288 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
2289 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */
2290 /* position alpha to allow for mullo and mulhi on diff channels
2291 to reduce the number of operations */
2292 mm_alpha = _mm_slli_si64(mm_alpha, 3); /* each lane becomes (alpha & ~7) << 3; assuming alpha < 256 this cannot spill into the next lane */
2293
2294 /* Setup the 555 color channel masks */
2295 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */
2296 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */
2297 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */
2298
2299 while(height--) {
2300 DUFFS_LOOP_QUATRO2(
2301 {
2302 s = *srcp++;
2303 d = *dstp;
2304 /*
2305 * shift out the middle component (green) to
2306 * the high 16 bits, and process all three RGB
2307 * components at the same time.
2308 */
2309 s = (s | s << 16) & 0x03e07c1f;
2310 d = (d | d << 16) & 0x03e07c1f;
2311 d += (s - d) * alpha >> 5;
2312 d &= 0x03e07c1f;
2313 *dstp++ = d | d >> 16;
2314 },{
2315 s = *srcp++;
2316 d = *dstp;
2317 /*
2318 * shift out the middle component (green) to
2319 * the high 16 bits, and process all three RGB
2320 * components at the same time.
2321 */
2322 s = (s | s << 16) & 0x03e07c1f;
2323 d = (d | d << 16) & 0x03e07c1f;
2324 d += (s - d) * alpha >> 5;
2325 d &= 0x03e07c1f;
2326 *dstp++ = d | d >> 16;
2327 s = *srcp++;
2328 d = *dstp;
2329 /*
2330 * shift out the middle component (green) to
2331 * the high 16 bits, and process all three RGB
2332 * components at the same time.
2333 */
2334 s = (s | s << 16) & 0x03e07c1f;
2335 d = (d | d << 16) & 0x03e07c1f;
2336 d += (s - d) * alpha >> 5;
2337 d &= 0x03e07c1f;
2338 *dstp++ = d | d >> 16;
2339 },{
2340 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
2341 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */
2342
2343 /* red -- process the bits in place */
2344 src2 = src1;
2345 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */
2346
2347 dst2 = dst1;
2348 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */
2349
2350 /* blend */
2351 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2352 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2353 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2354 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2355 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */
2356
2357 mm_res = dst2; /* RED -> mm_res */
2358
2359 /* green -- process the bits in place */
2360 src2 = src1;
2361 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */
2362
2363 dst2 = dst1;
2364 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */
2365
2366 /* blend */
2367 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2368 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2369 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
2370 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2371
2372 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */
2373
2374 /* blue */
2375 src2 = src1; /* src -> src2 */
2376 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */
2377
2378 dst2 = dst1; /* dst -> dst2 */
2379 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */
2380
2381 /* blend */
2382 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
2383 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
2384 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
2385 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
2386 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */
2387
2388 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */
2389
2390 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */
2391
2392 srcp += 4;
2393 dstp += 4;
2394 }, width);
2395 srcp += srcskip;
2396 dstp += dstskip;
2397 }
2398 _mm_empty(); /* leave MMX state before returning */
2399 }
2400 }
2401 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */
1834 2402
1835 /* fast RGB565->RGB565 blending with surface alpha */ 2403 /* fast RGB565->RGB565 blending with surface alpha */
1836 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info) 2404 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info)
1837 { 2405 {
1838 unsigned alpha = info->src->alpha; 2406 unsigned alpha = info->src->alpha;
2175 2743
2176 case 4: 2744 case 4:
2177 if(sf->Rmask == df->Rmask 2745 if(sf->Rmask == df->Rmask
2178 && sf->Gmask == df->Gmask 2746 && sf->Gmask == df->Gmask
2179 && sf->Bmask == df->Bmask 2747 && sf->Bmask == df->Bmask
2180 && (sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff
2181 && sf->BytesPerPixel == 4) 2748 && sf->BytesPerPixel == 4)
2182 { 2749 {
2183 #if MMX_ASMBLIT 2750 #if MMX_ASMBLIT
2184 if(SDL_HasMMX()) 2751 if(sf->Rshift % 8 == 0
2185 return BlitRGBtoRGBSurfaceAlphaMMX; 2752 && sf->Gshift % 8 == 0
2753 && sf->Bshift % 8 == 0
2754 && SDL_HasMMX())
2755 return BlitRGBtoRGBSurfaceAlphaMMX;
2756 #endif
2757 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff)
2758 {
2759 #if USE_ALTIVEC_BLITTERS
2760 if(SDL_HasAltiVec())
2761 return BlitRGBtoRGBSurfaceAlphaAltivec;
2762 #endif
2763 return BlitRGBtoRGBSurfaceAlpha;
2764 }
2765 }
2766 #if SDL_ALTIVEC_BLITTERS
2767 if((sf->BytesPerPixel == 4) &&
2768 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2769 return Blit32to32SurfaceAlphaAltivec;
2186 else 2770 else
2187 #endif 2771 #endif
2188 #if SDL_ALTIVEC_BLITTERS 2772 return BlitNtoNSurfaceAlpha;
2189 if(!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2190 return BlitRGBtoRGBSurfaceAlphaAltivec;
2191 else
2192 #endif
2193 return BlitRGBtoRGBSurfaceAlpha;
2194 }
2195 else
2196 #if SDL_ALTIVEC_BLITTERS
2197 if((sf->BytesPerPixel == 4) &&
2198 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2199 return Blit32to32SurfaceAlphaAltivec;
2200 else
2201 #endif
2202 return BlitNtoNSurfaceAlpha;
2203 2773
2204 case 3: 2774 case 3:
2205 default: 2775 default:
2206 return BlitNtoNSurfaceAlpha; 2776 return BlitNtoNSurfaceAlpha;
2207 } 2777 }
2230 return BlitARGBto555PixelAlpha; 2800 return BlitARGBto555PixelAlpha;
2231 } 2801 }
2232 return BlitNtoNPixelAlpha; 2802 return BlitNtoNPixelAlpha;
2233 2803
2234 case 4: 2804 case 4:
2235 if(sf->Amask == 0xff000000 2805 if(sf->Rmask == df->Rmask
2236 && sf->Rmask == df->Rmask
2237 && sf->Gmask == df->Gmask 2806 && sf->Gmask == df->Gmask
2238 && sf->Bmask == df->Bmask 2807 && sf->Bmask == df->Bmask
2239 && sf->BytesPerPixel == 4) 2808 && sf->BytesPerPixel == 4)
2240 { 2809 {
2241 #if MMX_ASMBLIT 2810 #if MMX_ASMBLIT
2242 if(SDL_Has3DNow()) 2811 if(sf->Rshift % 8 == 0
2243 return BlitRGBtoRGBPixelAlphaMMX3DNOW; 2812 && sf->Gshift % 8 == 0
2244 else 2813 && sf->Bshift % 8 == 0
2245 if(SDL_HasMMX()) 2814 && sf->Ashift % 8 == 0
2246 return BlitRGBtoRGBPixelAlphaMMX; 2815 && sf->Aloss == 0)
2247 else 2816 {
2817 if(SDL_Has3DNow())
2818 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
2819 if(SDL_HasMMX())
2820 return BlitRGBtoRGBPixelAlphaMMX;
2821 }
2248 #endif 2822 #endif
2249 #if SDL_ALTIVEC_BLITTERS 2823 if(sf->Amask == 0xff000000)
2250 if(!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) 2824 {
2251 return BlitRGBtoRGBPixelAlphaAltivec; 2825 #if USE_ALTIVEC_BLITTERS
2252 else 2826 if(SDL_HasAltiVec())
2827 return BlitRGBtoRGBPixelAlphaAltivec;
2253 #endif 2828 #endif
2254 return BlitRGBtoRGBPixelAlpha; 2829 return BlitRGBtoRGBPixelAlpha;
2830 }
2255 } 2831 }
2256 #if SDL_ALTIVEC_BLITTERS 2832 #if SDL_ALTIVEC_BLITTERS
2257 if (sf->Amask && sf->BytesPerPixel == 4 && 2833 if (sf->Amask && sf->BytesPerPixel == 4 &&
2258 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) 2834 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec())
2259 return Blit32to32PixelAlphaAltivec; 2835 return Blit32to32PixelAlphaAltivec;
2260 else 2836 else
2261 #endif 2837 #endif
2262 return BlitNtoNPixelAlpha; 2838 return BlitNtoNPixelAlpha;
2263 2839
2264 case 3: 2840 case 3:
2265 default: 2841 default:
2266 return BlitNtoNPixelAlpha; 2842 return BlitNtoNPixelAlpha;
2267 } 2843 }