comparison src/video/SDL_blit_A.c @ 1542:a8bf1aa21020
Fixed bug #15
SDL_blit_A.mmx-speed.patch.txt --
Speed improvements and a bugfix for the current GCC inline MMX
asm code:
- Changed some ops and removed the ones that became useless as a result.
- Added some instruction parallelism (some gain)
The resulting speed on my Xeon improved by up to 35% depending on
the function (measured in fps).
- Fixed a bug where BlitRGBtoRGBSurfaceAlphaMMX() was
setting the alpha component of the destination surface (to
opaque alpha) even when the surface had none.
SDL_blit_A.mmx-msvc.patch.txt --
An MSVC MMX intrinsics version of the same GCC asm code.
The MSVC compiler tries to parallelize the code and avoid
register stalls, but does not always do a very good job.
The per-surface blending MSVC functions run quite a bit faster
than their pure-asm counterparts (up to 55% faster for the 16-bit
ones), but per-pixel blending runs somewhat slower than the asm.
- BlitRGBtoRGBSurfaceAlphaMMX and BlitRGBtoRGBPixelAlphaMMX (and all
variants) can now also handle formats other than (A)RGB8888. Formats
like RGBA8888 and some quite exotic ones are allowed -- like
RAGB8888, or actually anything with channels aligned on an 8-bit
boundary and a full 8-bit alpha channel (for per-pixel alpha blending).
The performance cost of this change is virtually zero for per-surface
alpha blending (no extra ops inside the loop) and a single non-MMX
op inside the loop for per-pixel blending. In testing, per-pixel
alpha blending takes a ~2% performance hit but still runs much
faster than the current code in CVS. If necessary, a separate function
with this functionality can be made.
This code requires the Processor Pack for VC6.
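
All of these blitters implement the same per-channel blend, approximating /255 with >>8 (the FIXME comments in the diff below note the consequence: opaque alpha must be special-cased). A minimal scalar sketch of the math; the helper name is illustrative, not part of the patch:

```c
#include <stdint.h>

/* Per 8-bit channel: dst += (src - dst) * alpha >> 8.
   The >>8 stands in for /255, so alpha == 255 does not quite reach
   src; that is why the per-pixel blitters special-case opaque alpha.
   An arithmetic shift is assumed for negative differences. */
static uint8_t blend_channel(uint8_t src, uint8_t dst, uint8_t alpha)
{
    return (uint8_t)(dst + (((int)src - (int)dst) * (int)alpha >> 8));
}
```
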
author | Sam Lantinga <slouken@libsdl.org> |
---|---|
date | Wed, 15 Mar 2006 15:39:29 +0000 |
parents | dc6b59e925a2 |
children | 4b835e36633d |
1541:157001382dfd | 1542:a8bf1aa21020 |
---|---|
22 #include "SDL_config.h" | 22 #include "SDL_config.h" |
23 | 23 |
24 #include "SDL_video.h" | 24 #include "SDL_video.h" |
25 #include "SDL_blit.h" | 25 #include "SDL_blit.h" |
26 | 26 |
27 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES | 27 #if SDL_ASSEMBLY_ROUTINES |
28 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) | |
28 #define MMX_ASMBLIT 1 | 29 #define MMX_ASMBLIT 1 |
30 #define GCC_ASMBLIT 1 | |
31 #elif defined(_MSC_VER) && (_MSC_VER >= 1200) && defined(_M_IX86) | |
32 #define MMX_ASMBLIT 1 | |
33 #define MSVC_ASMBLIT 1 | |
29 #endif | 34 #endif |
35 #endif /* SDL_ASSEMBLY_ROUTINES */ | |
30 | 36 |
31 /* Function to check the CPU flags */ | 37 /* Function to check the CPU flags */ |
32 #include "SDL_cpuinfo.h" | 38 #include "SDL_cpuinfo.h" |
33 #if MMX_ASMBLIT | 39 #if GCC_ASMBLIT |
34 #include "mmx.h" | 40 #include "mmx.h" |
41 #elif MSVC_ASMBLIT | |
42 #include <mmintrin.h> | |
43 #include <mm3dnow.h> | |
35 #endif | 44 #endif |
36 | 45 |
37 /* Functions to perform alpha blended blitting */ | 46 /* Functions to perform alpha blended blitting */ |
38 | 47 |
39 /* N->1 blending with per-surface alpha */ | 48 /* N->1 blending with per-surface alpha */ |
196 src += srcskip; | 205 src += srcskip; |
197 dst += dstskip; | 206 dst += dstskip; |
198 } | 207 } |
199 } | 208 } |
200 | 209 |
201 #if MMX_ASMBLIT | 210 #if GCC_ASMBLIT |
202 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ | 211 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ |
203 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) | 212 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) |
204 { | 213 { |
205 int width = info->d_width; | 214 int width = info->d_width; |
206 int height = info->d_height; | 215 int height = info->d_height; |
207 Uint32 *srcp = (Uint32 *)info->s_pixels; | 216 Uint32 *srcp = (Uint32 *)info->s_pixels; |
208 int srcskip = info->s_skip >> 2; | 217 int srcskip = info->s_skip >> 2; |
209 Uint32 *dstp = (Uint32 *)info->d_pixels; | 218 Uint32 *dstp = (Uint32 *)info->d_pixels; |
210 int dstskip = info->d_skip >> 2; | 219 int dstskip = info->d_skip >> 2; |
211 Uint8 load[8]; | 220 Uint32 dalpha = info->dst->Amask; |
212 | 221 Uint8 load[8]; |
213 *(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */ | 222 |
214 movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */ | 223 *(Uint64 *)load = 0x00fefefe00fefefeULL;/* alpha128 mask */ |
215 *(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */ | 224 movq_m2r(*load, mm4); /* alpha128 mask -> mm4 */ |
216 movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */ | 225 *(Uint64 *)load = 0x0001010100010101ULL;/* !alpha128 mask */ |
217 *(Uint64 *)load = 0xFF000000FF000000ULL;/* dst alpha mask */ | 226 movq_m2r(*load, mm3); /* !alpha128 mask -> mm3 */ |
218 movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */ | 227 movd_m2r(dalpha, mm7); /* dst alpha mask */ |
228 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */ | |
219 while(height--) { | 229 while(height--) { |
220 DUFFS_LOOP_DOUBLE2( | 230 DUFFS_LOOP_DOUBLE2( |
221 { | 231 { |
222 Uint32 s = *srcp++; | 232 Uint32 s = *srcp++; |
223 Uint32 d = *dstp; | 233 Uint32 d = *dstp; |
224 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) | 234 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) |
225 + (s & d & 0x00010101)) | 0xff000000; | 235 + (s & d & 0x00010101)) | dalpha; |
226 },{ | 236 },{ |
227 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ | 237 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ |
228 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ | 238 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ |
229 | 239 |
230 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */ | 240 movq_m2r((*srcp), mm1);/* 2 x src -> mm1(ARGBARGB) */ |
231 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */ | 241 movq_r2r(mm1, mm5); /* 2 x src -> mm5(ARGBARGB) */ |
232 | 242 |
233 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */ | 243 pand_r2r(mm4, mm6); /* dst & mask -> mm6 */ |
234 pand_r2r(mm4, mm5); /* src & mask -> mm5 */ | 244 pand_r2r(mm4, mm5); /* src & mask -> mm5 */ |
235 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */ | 245 paddd_r2r(mm6, mm5); /* mm6 + mm5 -> mm5 */ |
236 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */ | 246 pand_r2r(mm1, mm2); /* src & dst -> mm2 */ |
237 | 247 psrld_i2r(1, mm5); /* mm5 >> 1 -> mm5 */ |
238 pand_r2r(mm1, mm2); /* src & dst -> mm2 */ | 248 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */ |
239 pand_r2r(mm3, mm2); /* mm2 & !mask -> mm2 */ | 249 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */ |
240 paddd_r2r(mm5, mm2); /* mm5 + mm2 -> mm2 */ | 250 |
241 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ | 251 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ |
242 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */ | 252 movq_r2m(mm2, (*dstp));/* mm2 -> 2 x dst pixels */ |
243 dstp += 2; | 253 dstp += 2; |
244 srcp += 2; | 254 srcp += 2; |
245 }, width); | 255 }, width); |
246 srcp += srcskip; | 256 srcp += srcskip; |
247 dstp += dstskip; | 257 dstp += dstskip; |
248 } | 258 } |
249 emms(); | 259 emms(); |
250 } | 260 } |
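
The alpha=128 case needs no multiply at all: as the scalar branch above shows, two pixels are averaged per channel by clearing each channel's low bit (so the add cannot carry into a neighbour), halving, and adding back the low bit both pixels shared. The same trick as a plain function (illustrative name):

```c
#include <stdint.h>

/* 50% blend of two ARGB8888 pixels without unpacking the channels. */
static uint32_t average_argb(uint32_t s, uint32_t d, uint32_t dst_amask)
{
    return ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
            + (s & d & 0x00010101)) | dst_amask;
}
```
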
251 | 261 |
252 /* fast RGB888->(A)RGB888 blending with surface alpha */ | 262 /* fast RGB888->(A)RGB888 blending with surface alpha */ |
253 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) | 263 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) |
254 { | 264 { |
265 SDL_PixelFormat* df = info->dst; | |
255 unsigned alpha = info->src->alpha; | 266 unsigned alpha = info->src->alpha; |
256 if(alpha == 128) { | 267 |
268 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) { | |
269 /* only call a128 version when R,G,B occupy lower bits */ | |
257 BlitRGBtoRGBSurfaceAlpha128MMX(info); | 270 BlitRGBtoRGBSurfaceAlpha128MMX(info); |
258 } else { | 271 } else { |
259 int width = info->d_width; | 272 int width = info->d_width; |
260 int height = info->d_height; | 273 int height = info->d_height; |
261 Uint32 *srcp = (Uint32 *)info->s_pixels; | 274 Uint32 *srcp = (Uint32 *)info->s_pixels; |
262 int srcskip = info->s_skip >> 2; | 275 int srcskip = info->s_skip >> 2; |
263 Uint32 *dstp = (Uint32 *)info->d_pixels; | 276 Uint32 *dstp = (Uint32 *)info->d_pixels; |
264 int dstskip = info->d_skip >> 2; | 277 int dstskip = info->d_skip >> 2; |
265 Uint8 load[8] = {alpha, alpha, alpha, alpha, | 278 |
266 alpha, alpha, alpha, alpha}; | 279 pxor_r2r(mm5, mm5); /* 0 -> mm5 */ |
267 | 280 /* form the alpha mult */ |
268 movq_m2r(*load, mm4); /* alpha -> mm4 */ | 281 movd_m2r(alpha, mm4); /* 0000000A -> mm4 */ |
269 *(Uint64 *)load = 0x00FF00FF00FF00FFULL; | 282 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */ |
270 movq_m2r(*load, mm3); /* mask -> mm3 */ | 283 punpckldq_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */ |
271 pand_r2r(mm3, mm4); /* mm4 & mask -> 0A0A0A0A -> mm4 */ | 284 alpha = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift); |
272 *(Uint64 *)load = 0xFF000000FF000000ULL;/* dst alpha mask */ | 285 movd_m2r(alpha, mm0); /* 00000FFF -> mm0 */ |
273 movq_m2r(*load, mm7); /* dst alpha mask -> mm7 */ | 286 punpcklbw_r2r(mm0, mm0); /* 00FFFFFF -> mm0 */ |
287 pand_r2r(mm0, mm4); /* 0A0A0A0A -> mm4, minus 1 chan */ | |
288 /* at this point mm4 can be 000A0A0A or 0A0A0A00 or another combo */ | |
289 movd_m2r(df->Amask, mm7); /* dst alpha mask */ | |
290 punpckldq_r2r(mm7, mm7); /* dst alpha mask | dst alpha mask -> mm7 */ | |
274 | 291 |
275 while(height--) { | 292 while(height--) { |
276 DUFFS_LOOP_DOUBLE2({ | 293 DUFFS_LOOP_DOUBLE2({ |
277 /* One Pixel Blend */ | 294 /* One Pixel Blend */ |
278 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ | 295 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ |
279 punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */ | 296 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ |
280 pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */ | 297 punpcklbw_r2r(mm5, mm1); /* 0A0R0G0B -> mm1(src) */ |
281 | 298 punpcklbw_r2r(mm5, mm2); /* 0A0R0G0B -> mm2(dst) */ |
282 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ | 299 |
283 movq_r2r(mm2, mm6);/* dst(ARGB) -> mm6 (0000ARGB)*/ | 300 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ |
284 punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */ | 301 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ |
285 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ | 302 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ |
286 | 303 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ |
287 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ | 304 |
288 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ | 305 packuswb_r2r(mm5, mm2); /* ARGBARGB -> mm2 */ |
289 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ | 306 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ |
290 paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ | 307 movd_r2m(mm2, *dstp);/* mm2 -> pixel */ |
291 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ | |
292 packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */ | |
293 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ | |
294 movd_r2m(mm2, *dstp);/* mm2 -> Pixel */ | |
295 ++srcp; | 308 ++srcp; |
296 ++dstp; | 309 ++dstp; |
297 },{ | 310 },{ |
298 /* Two Pixels Blend */ | 311 /* Two Pixels Blend */ |
299 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/ | 312 movq_m2r((*srcp), mm0);/* 2 x src -> mm0(ARGBARGB)*/ |
300 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */ | 313 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ |
301 punpcklbw_r2r(mm0, mm0); /* low - AARRGGBB -> mm0 */ | 314 movq_r2r(mm0, mm1); /* 2 x src -> mm1(ARGBARGB) */ |
302 pand_r2r(mm3, mm0); /* 0A0R0G0B -> mm0(src1) */ | 315 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ |
303 punpckhbw_r2r(mm1, mm1); /* high - AARRGGBB -> mm1 */ | 316 |
304 pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1(src2) */ | 317 punpcklbw_r2r(mm5, mm0); /* low - 0A0R0G0B -> mm0(src1) */ |
305 | 318 punpckhbw_r2r(mm5, mm1); /* high - 0A0R0G0B -> mm1(src2) */ |
306 movq_m2r((*dstp), mm2);/* 2 x dst -> mm2(ARGBARGB) */ | 319 punpcklbw_r2r(mm5, mm2); /* low - 0A0R0G0B -> mm2(dst1) */ |
307 movq_r2r(mm2, mm5); /* 2 x dst -> mm5(ARGBARGB) */ | 320 punpckhbw_r2r(mm5, mm6); /* high - 0A0R0G0B -> mm6(dst2) */ |
308 movq_r2r(mm2, mm6); /* 2 x dst -> mm6(ARGBARGB) */ | 321 |
309 punpcklbw_r2r(mm2, mm2); /* low - AARRGGBB -> mm2 */ | 322 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */ |
310 punpckhbw_r2r(mm6, mm6); /* high - AARRGGBB -> mm6 */ | 323 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */ |
311 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2(dst1) */ | 324 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */ |
312 | 325 paddb_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */ |
313 psubw_r2r(mm2, mm0);/* src1 - dst1 -> mm0 */ | 326 |
314 pmullw_r2r(mm4, mm0); /* mm0 * alpha -> mm0 */ | 327 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */ |
315 pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6(dst2) */ | 328 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ |
316 psrlw_i2r(8, mm0); /* mm0 >> 8 -> mm1 */ | 329 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ |
317 psubw_r2r(mm6, mm1);/* src2 - dst2 -> mm1 */ | 330 paddb_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */ |
318 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ | 331 |
319 paddw_r2r(mm0, mm2); /* mm0 + mm2(dst1) -> mm2 */ | 332 packuswb_r2r(mm6, mm2); /* ARGBARGB -> mm2 */ |
320 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm0 */ | 333 por_r2r(mm7, mm2); /* mm7(dst alpha) | mm2 -> mm2 */ |
321 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ | 334 |
322 paddw_r2r(mm1, mm6); /* mm1 + mm6(dst2) -> mm6 */ | 335 movq_r2m(mm2, *dstp);/* mm2 -> 2 x pixel */ |
323 pand_r2r(mm3, mm6); /* 0A0R0G0B -> mm6 */ | 336 |
324 packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */ | 337 srcp += 2; |
325 packuswb_r2r(mm6, mm6); /* ARGBARGB -> mm6 */ | 338 dstp += 2; |
326 psrlq_i2r(32, mm2); /* mm2 >> 32 -> mm2 */ | 339 }, width); |
327 psllq_i2r(32, mm6); /* mm6 << 32 -> mm6 */ | |
328 por_r2r(mm6, mm2); /* mm6 | mm2 -> mm2 */ | |
329 por_r2r(mm7, mm2); /* mm7(full alpha) | mm2 -> mm2 */ | |
330 movq_r2m(mm2, *dstp);/* mm2 -> 2 x Pixel */ | |
331 srcp += 2; | |
332 dstp += 2; | |
333 }, width); | |
334 srcp += srcskip; | 340 srcp += srcskip; |
335 dstp += dstskip; | 341 dstp += dstskip; |
336 } | 342 } |
337 emms(); | 343 emms(); |
338 } | 344 } |
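
The format generalization in the function above comes down to how the 0A0A0A0A multiplier is built: the 16-bit lane that lines up with the destination's alpha channel is masked out, so the multiply-add leaves dst alpha alone wherever that channel sits. A scalar sketch of the 32-bit mask construction (mirroring the amult/chanmask lines in the MSVC version further down):

```c
#include <stdint.h>

/* Replicate alpha into all four bytes, then clear the byte at the
   destination's alpha position; unpacking bytes to 16-bit words then
   yields 0A0A0A0A with the alpha lane zeroed (000A0A0A, 0A0A0A00, ...). */
static uint32_t make_alpha_mult(unsigned alpha, unsigned Rshift,
                                unsigned Gshift, unsigned Bshift)
{
    uint32_t amult = alpha | (alpha << 8);
    amult |= amult << 16;
    return amult & ((0xffu << Rshift) | (0xffu << Gshift) | (0xffu << Bshift));
}
```
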
345 int height = info->d_height; | 351 int height = info->d_height; |
346 Uint32 *srcp = (Uint32 *)info->s_pixels; | 352 Uint32 *srcp = (Uint32 *)info->s_pixels; |
347 int srcskip = info->s_skip >> 2; | 353 int srcskip = info->s_skip >> 2; |
348 Uint32 *dstp = (Uint32 *)info->d_pixels; | 354 Uint32 *dstp = (Uint32 *)info->d_pixels; |
349 int dstskip = info->d_skip >> 2; | 355 int dstskip = info->d_skip >> 2; |
350 Uint32 alpha = 0; | 356 SDL_PixelFormat* sf = info->src; |
351 Uint8 load[8]; | 357 Uint32 amask = sf->Amask; |
352 | 358 |
353 *(Uint64 *)load = 0x00FF00FF00FF00FFULL; | 359 pxor_r2r(mm6, mm6); /* 0 -> mm6 */ |
354 movq_m2r(*load, mm3); /* mask -> mm2 */ | 360 /* form multiplication mask */ |
355 *(Uint64 *)load = 0x00FF000000000000ULL; | 361 movd_m2r(sf->Amask, mm7); /* 0000F000 -> mm7 */ |
356 movq_m2r(*load, mm7); /* dst alpha mask -> mm2 */ | 362 punpcklbw_r2r(mm7, mm7); /* FF000000 -> mm7 */ |
357 *(Uint64 *)load = 0x00FFFFFF00FFFFFFULL; | 363 pcmpeqb_r2r(mm0, mm0); /* FFFFFFFF -> mm0 */ |
358 movq_m2r(*load, mm0); /* alpha 255 mask -> mm0 */ | 364 movq_r2r(mm0, mm3); /* FFFFFFFF -> mm3 (for later) */ |
359 *(Uint64 *)load = 0xFF000000FF000000ULL; | 365 pxor_r2r(mm0, mm7); /* 00FFFFFF -> mm7 (mult mask) */ |
360 movq_m2r(*load, mm6); /* alpha 255 !mask -> mm6 */ | 366 /* form channel masks */ |
367 movq_r2r(mm7, mm0); /* 00FFFFFF -> mm0 */ | |
368 packsswb_r2r(mm6, mm0); /* 00000FFF -> mm0 (channel mask) */ | |
369 packsswb_r2r(mm6, mm3); /* 0000FFFF -> mm3 */ | |
370 pxor_r2r(mm0, mm3); /* 0000F000 -> mm3 (~channel mask) */ | |
371 /* get alpha channel shift */ | |
372 movd_m2r(sf->Ashift, mm5); /* Ashift -> mm5 */ | |
373 | |
361 while(height--) { | 374 while(height--) { |
362 DUFFS_LOOP4({ | 375 DUFFS_LOOP4({ |
363 alpha = *srcp; | 376 Uint32 alpha = *srcp & amask; |
364 alpha >>= 24; | |
365 /* FIXME: Here we special-case opaque alpha since the | 377 /* FIXME: Here we special-case opaque alpha since the |
366 compositioning used (>>8 instead of /255) doesn't handle | 378 compositioning used (>>8 instead of /255) doesn't handle |
367 it correctly. Also special-case alpha=0 for speed? | 379 it correctly. Also special-case alpha=0 for speed? |
368 Benchmark this! */ | 380 Benchmark this! */ |
369 if(alpha) { | 381 if(alpha == 0) { |
370 if(alpha == SDL_ALPHA_OPAQUE) { | 382 /* do nothing */ |
371 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ | 383 } else if(alpha == amask) { |
372 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ | 384 /* opaque alpha -- copy RGB, keep dst alpha */ |
373 pand_r2r(mm0, mm1); | 385 /* using MMX here to free up regular registers for other things */ |
374 pand_r2r(mm6, mm2); | 386 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ |
375 por_r2r(mm1, mm2); | 387 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ |
376 movd_r2m(mm2, (*dstp)); | 388 pand_r2r(mm0, mm1); /* src & chanmask -> mm1 */ |
377 } else { | 389 pand_r2r(mm3, mm2); /* dst & ~chanmask -> mm2 */ |
378 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ | 390 por_r2r(mm1, mm2); /* src | dst -> mm2 */ |
379 punpcklbw_r2r(mm1, mm1); /* AARRGGBB -> mm1 */ | 391 movd_r2m(mm2, (*dstp)); /* mm2 -> dst */ |
380 pand_r2r(mm3, mm1); /* 0A0R0G0B -> mm1 */ | 392 } else { |
381 | 393 movd_m2r((*srcp), mm1);/* src(ARGB) -> mm1 (0000ARGB)*/ |
382 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ | 394 punpcklbw_r2r(mm6, mm1); /* 0A0R0G0B -> mm1 */ |
383 punpcklbw_r2r(mm2, mm2); /* AARRGGBB -> mm2 */ | 395 |
384 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ | 396 movd_m2r((*dstp), mm2);/* dst(ARGB) -> mm2 (0000ARGB)*/ |
385 | 397 punpcklbw_r2r(mm6, mm2); /* 0A0R0G0B -> mm2 */ |
386 movq_r2r(mm2, mm5);/* mm2(0A0R0G0B) -> mm5 */ | 398 |
387 pand_r2r(mm7, mm5); /* mm5 & dst alpha mask -> mm5(0A000000) */ | 399 __asm__ __volatile__ ( |
388 psrlq_i2r(24, mm5); /* mm5 >> 24 -> mm5 (0000A000)*/ | 400 "movd %0, %%mm4" |
389 | 401 : : "r" (alpha) ); /* 0000A000 -> mm4 */ |
390 movq_r2r(mm1, mm4);/* mm1(0A0R0G0B) -> mm4 */ | 402 psrld_r2r(mm5, mm4); /* mm4 >> mm5 -> mm4 (0000000A) */ |
391 psrlq_i2r(48, mm4); /* mm4 >> 48 -> mm4(0000000A) */ | 403 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */ |
392 punpcklwd_r2r(mm4, mm4); /* 00000A0A -> mm4 */ | 404 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */ |
393 punpcklwd_r2r(mm4, mm4); /* 0A0A0A0A -> mm4 */ | 405 pand_r2r(mm7, mm4); /* 000A0A0A -> mm4, preserve dst alpha on add */ |
394 | 406 |
395 /* blend */ | 407 /* blend */ |
396 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ | 408 psubw_r2r(mm2, mm1);/* src - dst -> mm1 */ |
397 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ | 409 pmullw_r2r(mm4, mm1); /* mm1 * alpha -> mm1 */ |
398 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1 */ | 410 psrlw_i2r(8, mm1); /* mm1 >> 8 -> mm1(000R0G0B) */ |
399 paddw_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ | 411 paddb_r2r(mm1, mm2); /* mm1 + mm2(dst) -> mm2 */ |
400 pand_r2r(mm3, mm2); /* 0A0R0G0B -> mm2 */ | 412 |
401 packuswb_r2r(mm2, mm2); /* ARGBARGB -> mm2 */ | 413 packuswb_r2r(mm6, mm2); /* 0000ARGB -> mm2 */ |
402 pand_r2r(mm0, mm2); /* 0RGB0RGB -> mm2 */ | 414 movd_r2m(mm2, *dstp);/* mm2 -> dst */ |
403 por_r2r(mm5, mm2); /* dst alpha | mm2 -> mm2 */ | |
404 movd_r2m(mm2, *dstp);/* mm2 -> dst */ | |
405 } | |
406 } | 415 } |
407 ++srcp; | 416 ++srcp; |
408 ++dstp; | 417 ++dstp; |
409 }, width); | 418 }, width); |
410 srcp += srcskip; | 419 srcp += srcskip; |
411 dstp += dstskip; | 420 dstp += dstskip; |
412 } | 421 } |
413 emms(); | 422 emms(); |
414 } | 423 } |
415 #endif | 424 /* End GCC_ASMBLIT */ |
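
Per pixel, the new code reads alpha through the source format's Amask/Ashift instead of a hard-coded >>24, and branches on the two exact cases the >>8 approximation cannot handle well. A scalar model of the decision tree (helper name and byte loop are illustrative; assumes channels on byte boundaries, as the patch requires):

```c
#include <stdint.h>

static uint32_t blend_pixel(uint32_t s, uint32_t d, uint32_t amask,
                            unsigned ashift, uint32_t chanmask)
{
    uint32_t alpha = s & amask;
    if (alpha == 0)
        return d;                              /* transparent: keep dst */
    if (alpha == amask)                        /* opaque: copy RGB, keep dst alpha */
        return (s & chanmask) | (d & ~chanmask);
    unsigned a = alpha >> ashift;
    uint32_t out = d & ~chanmask;              /* dst alpha preserved on the add */
    for (unsigned sh = 0; sh < 32; sh += 8) {
        if (((chanmask >> sh) & 0xff) == 0)
            continue;                          /* skip the alpha byte */
        int sc = (int)((s >> sh) & 0xff);
        int dc = (int)((d >> sh) & 0xff);
        /* >>8 approximates /255; arithmetic shift assumed when sc < dc */
        out |= (uint32_t)((unsigned)(dc + ((sc - dc) * (int)a >> 8)) & 0xff) << sh;
    }
    return out;
}
```
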
425 | |
426 #elif MSVC_ASMBLIT | |
427 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */ | |
428 static void BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo *info) | |
429 { | |
430 int width = info->d_width; | |
431 int height = info->d_height; | |
432 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
433 int srcskip = info->s_skip >> 2; | |
434 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
435 int dstskip = info->d_skip >> 2; | |
436 Uint32 dalpha = info->dst->Amask; | |
437 | |
438 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta; | |
439 | |
440 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */ | |
441 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */ | |
442 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */ | |
443 | |
444 while (height--) { | |
445 int n = width; | |
446 if ( n & 1 ) { | |
447 Uint32 s = *srcp++; | |
448 Uint32 d = *dstp; | |
449 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) | |
450 + (s & d & 0x00010101)) | dalpha; | |
451 n--; | |
452 } | |
453 | |
454 for (n >>= 1; n > 0; --n) { | |
455 dst1 = *(__m64*)dstp; /* 2 x dst -> dst1(ARGBARGB) */ | |
456 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */ | |
457 | |
458 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB) */ | |
459 src2 = src1; /* 2 x src -> src2(ARGBARGB) */ | |
460 | |
461 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */ | |
462 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */ | |
463 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */ | |
464 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */ | |
465 | |
466 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */ | |
467 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */ | |
468 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */ | |
469 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */ | |
470 | |
471 *(__m64*)dstp = dst1; /* dst1 -> 2 x dst pixels */ | |
472 dstp += 2; | |
473 srcp += 2; | |
474 } | |
475 | |
476 srcp += srcskip; | |
477 dstp += dstskip; | |
478 } | |
479 _mm_empty(); | |
480 } | |
481 | |
482 /* fast RGB888->(A)RGB888 blending with surface alpha */ | |
483 static void BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo *info) | |
484 { | |
485 SDL_PixelFormat* df = info->dst; | |
486 Uint32 chanmask = df->Rmask | df->Gmask | df->Bmask; | |
487 unsigned alpha = info->src->alpha; | |
488 | |
489 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) { | |
490 /* only call a128 version when R,G,B occupy lower bits */ | |
491 BlitRGBtoRGBSurfaceAlpha128MMX(info); | |
492 } else { | |
493 int width = info->d_width; | |
494 int height = info->d_height; | |
495 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
496 int srcskip = info->s_skip >> 2; | |
497 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
498 int dstskip = info->d_skip >> 2; | |
499 Uint32 dalpha = df->Amask; | |
500 Uint32 amult; | |
501 | |
502 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta; | |
503 | |
504 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ | |
505 /* form the alpha mult */ | |
506 amult = alpha | (alpha << 8); | |
507 amult = amult | (amult << 16); | |
508 chanmask = (0xff << df->Rshift) | (0xff << df->Gshift) | (0xff << df->Bshift); | |
509 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */ | |
510 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */ | |
511 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */ | |
512 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */ | |
513 | |
514 while (height--) { | |
515 int n = width; | |
516 if (n & 1) { | |
517 /* One Pixel Blend */ | |
518 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB)*/ | |
519 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */ | |
520 | |
521 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/ | |
522 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ | |
523 | |
524 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */ | |
525 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | |
526 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */ | |
527 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */ | |
528 | |
529 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ | |
530 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */ | |
531 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ | |
532 | |
533 ++srcp; | |
534 ++dstp; | |
535 | |
536 n--; | |
537 } | |
538 | |
539 for (n >>= 1; n > 0; --n) { | |
540 /* Two Pixels Blend */ | |
541 src1 = *(__m64*)srcp; /* 2 x src -> src1(ARGBARGB)*/ | |
542 src2 = src1; /* 2 x src -> src2(ARGBARGB) */ | |
543 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */ | |
544 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */ | |
545 | |
546 dst1 = *(__m64*)dstp;/* 2 x dst -> dst1(ARGBARGB) */ | |
547 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */ | |
548 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */ | |
549 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */ | |
550 | |
551 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */ | |
552 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */ | |
553 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */ | |
554 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */ | |
555 | |
556 src2 = _mm_sub_pi16(src2, dst2);/* src2 - dst2 -> src2 */ | |
557 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | |
558 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */ | |
559 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */ | |
560 | |
561 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */ | |
562 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */ | |
563 | |
564 *(__m64*)dstp = dst1; /* dst1 -> 2 x pixel */ | |
565 | |
566 srcp += 2; | |
567 dstp += 2; | |
568 } | |
569 srcp += srcskip; | |
570 dstp += dstskip; | |
571 } | |
572 _mm_empty(); | |
573 } | |
574 } | |
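
For readers mapping the intrinsics back to the asm: every _mm_* call above is one MMX instruction (_mm_unpacklo_pi8 is punpcklbw, _mm_mullo_pi16 is pmullw, _mm_packs_pu16 is packuswb, _mm_empty is emms, and so on). The one-pixel core reduces to a short chain; a self-contained sketch (in real code _mm_empty belongs after the whole loop, not per pixel):

```c
#include <mmintrin.h>

static unsigned blend_one(unsigned s, unsigned d, __m64 mm_alpha, __m64 dsta)
{
    __m64 zero = _mm_setzero_si64();
    __m64 src = _mm_unpacklo_pi8(_mm_cvtsi32_si64((int)s), zero); /* 0A0R0G0B */
    __m64 dst = _mm_unpacklo_pi8(_mm_cvtsi32_si64((int)d), zero); /* 0A0R0G0B */
    src = _mm_srli_pi16(_mm_mullo_pi16(_mm_sub_pi16(src, dst), mm_alpha), 8);
    dst = _mm_or_si64(_mm_packs_pu16(_mm_add_pi8(src, dst), zero), dsta);
    unsigned out = (unsigned)_mm_cvtsi64_si32(dst);
    _mm_empty();
    return out;
}
```
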
575 | |
576 /* fast ARGB888->(A)RGB888 blending with pixel alpha */ | |
577 static void BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo *info) | |
578 { | |
579 int width = info->d_width; | |
580 int height = info->d_height; | |
581 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
582 int srcskip = info->s_skip >> 2; | |
583 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
584 int dstskip = info->d_skip >> 2; | |
585 SDL_PixelFormat* sf = info->src; | |
586 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; | |
587 Uint32 amask = sf->Amask; | |
588 Uint32 ashift = sf->Ashift; | |
589 Uint64 multmask; | |
590 | |
591 __m64 src1, dst1, mm_alpha, mm_zero, dmask; | |
592 | |
593 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ | |
594 multmask = ~(0xFFFFi64 << (ashift * 2)); | |
595 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */ | |
596 | |
597 while(height--) { | |
598 DUFFS_LOOP4({ | |
599 Uint32 alpha = *srcp & amask; | |
600 if (alpha == 0) { | |
601 /* do nothing */ | |
602 } else if (alpha == amask) { | |
603 /* opaque alpha -- copy RGB, keep dst alpha */ | |
604 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask); | |
605 } else { | |
606 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/ | |
607 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */ | |
608 | |
609 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/ | |
610 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ | |
611 | |
612 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */ | |
613 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */ | |
614 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ | |
615 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ | |
616 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */ | |
617 | |
618 /* blend */ | |
619 src1 = _mm_sub_pi16(src1, dst1);/* src1 - dst1 -> src1 */ | |
620 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src1 - dst1) * alpha -> src1 */ | |
621 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */ | |
622 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1 -> dst1(0A0R0G0B) */ | |
623 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ | |
624 | |
625 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ | |
626 } | |
627 ++srcp; | |
628 ++dstp; | |
629 }, width); | |
630 srcp += srcskip; | |
631 dstp += dstskip; | |
632 } | |
633 _mm_empty(); | |
634 } | |
635 /* End MSVC_ASMBLIT */ | |
636 | |
637 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ | |
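
One portability note on the function above: 0xFFFFi64 is an MSVC-specific integer-literal suffix. The mask itself is simple; ashift counts bits within the packed 32-bit pixel, so after unpacking bytes to 16-bit lanes the alpha lane starts at bit ashift*2. A portable equivalent:

```c
#include <stdint.h>

/* Same value as: multmask = ~(0xFFFFi64 << (ashift * 2));
   clears exactly the alpha lane of the unpacked 0A0A0A0A multiplier. */
static uint64_t make_multmask(unsigned ashift)
{
    return ~(0xFFFFull << (ashift * 2));
}
```
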
416 | 638 |
417 #if SDL_ALTIVEC_BLITTERS | 639 #if SDL_ALTIVEC_BLITTERS |
418 #if HAVE_ALTIVEC_H | 640 #if HAVE_ALTIVEC_H |
419 #include <altivec.h> | 641 #include <altivec.h> |
420 #endif | 642 #endif |
1324 srcp += srcskip; | 1546 srcp += srcskip; |
1325 dstp += dstskip; | 1547 dstp += dstskip; |
1326 } | 1548 } |
1327 } | 1549 } |
1328 | 1550 |
1329 #if MMX_ASMBLIT | 1551 #if GCC_ASMBLIT |
1330 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ | 1552 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ |
1331 inline static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info) | 1553 inline static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info) |
1332 { | 1554 { |
1333 int width = info->d_width; | 1555 int width = info->d_width; |
1334 int height = info->d_height; | 1556 int height = info->d_height; |
1335 Uint32 *srcp = (Uint32 *)info->s_pixels; | 1557 Uint32 *srcp = (Uint32 *)info->s_pixels; |
1336 int srcskip = info->s_skip >> 2; | 1558 int srcskip = info->s_skip >> 2; |
1337 Uint32 *dstp = (Uint32 *)info->d_pixels; | 1559 Uint32 *dstp = (Uint32 *)info->d_pixels; |
1338 int dstskip = info->d_skip >> 2; | 1560 int dstskip = info->d_skip >> 2; |
1339 | 1561 SDL_PixelFormat* sf = info->src; |
1340 Uint32 s; | 1562 Uint32 amask = sf->Amask; |
1341 Uint32 alpha; | |
1342 | 1563 |
1343 __asm__ ( | 1564 __asm__ ( |
1344 /* make mm6 all zeros. */ | 1565 /* make mm6 all zeros. */ |
1345 "pxor %%mm6, %%mm6\n" | 1566 "pxor %%mm6, %%mm6\n" |
1346 | 1567 |
1347 /* Make a mask to preserve the alpha. */ | 1568 /* Make a mask to preserve the alpha. */ |
1348 "pcmpeqb %%mm7, %%mm7\n\t" /* mm7(s) = FF FF FF FF | FF FF FF FF */ | 1569 "movd %0, %%mm7\n\t" /* 0000F000 -> mm7 */ |
1349 "psrlq $16, %%mm7\n\t" /* mm7(s) = 00 00 FF FF | FF FF FF FF */ | 1570 "punpcklbw %%mm7, %%mm7\n\t" /* FF000000 -> mm7 */ |
1350 | 1571 "pcmpeqb %%mm4, %%mm4\n\t" /* FFFFFFFF -> mm4 */ |
1351 : ); | 1572 "movq %%mm4, %%mm3\n\t" /* FFFFFFFF -> mm3 (for later) */ |
1573 "pxor %%mm4, %%mm7\n\t" /* 00FFFFFF -> mm7 (mult mask) */ | |
1574 | |
1575 /* form channel masks */ | |
1576 "movq %%mm7, %%mm4\n\t" /* 00FFFFFF -> mm4 */ | |
1577 "packsswb %%mm6, %%mm4\n\t" /* 00000FFF -> mm4 (channel mask) */ | |
1578 "packsswb %%mm6, %%mm3\n\t" /* 0000FFFF -> mm3 */ | |
1579 "pxor %%mm4, %%mm3\n\t" /* 0000F000 -> mm3 (~channel mask) */ | |
1580 | |
1581 /* get alpha channel shift */ | |
1582 "movd %1, %%mm5\n\t" /* Ashift -> mm5 */ | |
1583 | |
1584 : /* nothing */ : "m" (sf->Amask), "m" (sf->Ashift) ); | |
1352 | 1585 |
1353 while(height--) { | 1586 while(height--) { |
1354 | 1587 |
1355 DUFFS_LOOP4({ | 1588 DUFFS_LOOP4({ |
1589 Uint32 alpha; | |
1356 | 1590 |
1357 __asm__ ( | 1591 __asm__ ( |
1358 "prefetch 64(%0)\n" | 1592 "prefetch 64(%0)\n" |
1359 "prefetch 64(%1)\n" | 1593 "prefetch 64(%1)\n" |
1360 : : "r" (srcp), "r" (dstp) ); | 1594 : : "r" (srcp), "r" (dstp) ); |
1361 | 1595 |
1362 s = *srcp; | 1596 alpha = *srcp & amask; |
1363 alpha = s >> 24; | |
1364 /* FIXME: Here we special-case opaque alpha since the | 1597 /* FIXME: Here we special-case opaque alpha since the |
1365 compositioning used (>>8 instead of /255) doesn't handle | 1598 compositioning used (>>8 instead of /255) doesn't handle |
1366 it correctly. Also special-case alpha=0 for speed? | 1599 it correctly. Also special-case alpha=0 for speed? |
1367 Benchmark this! */ | 1600 Benchmark this! */ |
1368 | 1601 if(alpha == 0) { |
1369 if(alpha == SDL_ALPHA_OPAQUE) { | 1602 /* do nothing */ |
1370 *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); | 1603 } |
1604 else if(alpha == amask) { | |
1605 /* opaque alpha -- copy RGB, keep dst alpha */ | |
1606 /* using MMX here to free up regular registers for other things */ | |
1607 __asm__ ( | |
1608 "movd (%0), %%mm0\n\t" /* src(ARGB) -> mm0 (0000ARGB)*/ | |
1609 "movd (%1), %%mm1\n\t" /* dst(ARGB) -> mm1 (0000ARGB)*/ | |
1610 "pand %%mm4, %%mm0\n\t" /* src & chanmask -> mm0 */ | |
1611 "pand %%mm3, %%mm1\n\t" /* dst & ~chanmask -> mm2 */ | |
1612 "por %%mm0, %%mm1\n\t" /* src | dst -> mm1 */ | |
1613 "movd %%mm1, (%1) \n\t" /* mm1 -> dst */ | |
1614 | |
1615 : : "r" (srcp), "r" (dstp) ); | |
1371 } | 1616 } |
1372 | 1617 |
1373 else { | 1618 else { |
1374 __asm__ ( | 1619 __asm__ ( |
1375 /* load in the source, and dst. */ | 1620 /* load in the source, and dst. */ |
1381 /* if supporting pshufw */ | 1626 /* if supporting pshufw */ |
1382 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */ | 1627 /*"pshufw $0x55, %%mm0, %%mm2\n" */ /* mm2 = 0 As 0 As | 0 As 0 As */ |
1383 /*"psrlw $8, %%mm2\n" */ | 1628 /*"psrlw $8, %%mm2\n" */ |
1384 | 1629 |
1385 /* else: */ | 1630 /* else: */ |
1386 "movq %%mm0, %%mm2\n" | 1631 "movd %2, %%mm2\n" |
1387 "psrld $24, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */ | 1632 "psrld %%mm5, %%mm2\n" /* mm2 = 0 0 0 0 | 0 0 0 As */ |
1388 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */ | 1633 "punpcklwd %%mm2, %%mm2\n" /* mm2 = 0 0 0 0 | 0 As 0 As */ |
1389 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */ | 1634 "punpckldq %%mm2, %%mm2\n" /* mm2 = 0 As 0 As | 0 As 0 As */ |
1635 "pand %%mm7, %%mm2\n" /* to preserve dest alpha */ | |
1390 | 1636 |
1391 /* move the colors into words. */ | 1637 /* move the colors into words. */ |
1392 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */ | 1638 "punpcklbw %%mm6, %%mm0\n" /* mm0 = 0 As 0 Rs | 0 Gs 0 Bs */ |
1393 "punpcklbw %%mm6, %%mm1\n" /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */ | 1639 "punpcklbw %%mm6, %%mm1\n" /* mm0 = 0 Ad 0 Rd | 0 Gd 0 Bd */ |
1394 | 1640 |
1395 /* src - dst */ | 1641 /* src - dst */ |
1396 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */ | 1642 "psubw %%mm1, %%mm0\n" /* mm0 = As-Ad Rs-Rd | Gs-Gd Bs-Bd */ |
1397 | 1643 |
1398 /* A * (src-dst) */ | 1644 /* A * (src-dst) */ |
1399 "pmullw %%mm2, %%mm0\n" /* mm0 = As*As-d As*Rs-d | As*Gs-d As*Bs-d */ | 1645 "pmullw %%mm2, %%mm0\n" /* mm0 = 0*As-d As*Rs-d | As*Gs-d As*Bs-d */ |
1400 "pand %%mm7, %%mm0\n" /* to preserve dest alpha */ | 1646 "psrlw $8, %%mm0\n" /* mm0 = 0>>8 Rc>>8 | Gc>>8 Bc>>8 */ |
1401 "psrlw $8, %%mm0\n" /* mm0 = Ac>>8 Rc>>8 | Gc>>8 Bc>>8 */ | 1647 "paddb %%mm1, %%mm0\n" /* mm0 = 0+Ad Rc+Rd | Gc+Gd Bc+Bd */ |
1402 "paddb %%mm1, %%mm0\n" /* mm0 = Ac+Ad Rc+Rd | Gc+Gd Bc+Bd */ | |
1403 | 1648 |
1404 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */ | 1649 "packuswb %%mm0, %%mm0\n" /* mm0 = | Ac Rc Gc Bc */ |
1405 | 1650 |
1406 "movd %%mm0, (%1)\n" /* result in mm0 */ | 1651 "movd %%mm0, (%1)\n" /* result in mm0 */ |
1407 | 1652 |
1408 : : "r" (srcp), "r" (dstp) ); | 1653 : : "r" (srcp), "r" (dstp), "r" (alpha) ); |
1409 | 1654 |
1410 } | 1655 } |
1411 ++srcp; | 1656 ++srcp; |
1412 ++dstp; | 1657 ++dstp; |
1413 }, width); | 1658 }, width); |
1417 | 1662 |
1418 __asm__ ( | 1663 __asm__ ( |
1419 "emms\n" | 1664 "emms\n" |
1420 : ); | 1665 : ); |
1421 } | 1666 } |
1422 #endif | 1667 /* End GCC_ASMBLIT*/ |
1668 | |
1669 #elif MSVC_ASMBLIT | |
1670 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */ | |
1671 static void BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo *info) | |
1672 { | |
1673 int width = info->d_width; | |
1674 int height = info->d_height; | |
1675 Uint32 *srcp = (Uint32 *)info->s_pixels; | |
1676 int srcskip = info->s_skip >> 2; | |
1677 Uint32 *dstp = (Uint32 *)info->d_pixels; | |
1678 int dstskip = info->d_skip >> 2; | |
1679 SDL_PixelFormat* sf = info->src; | |
1680 Uint32 chanmask = sf->Rmask | sf->Gmask | sf->Bmask; | |
1681 Uint32 amask = sf->Amask; | |
1682 Uint32 ashift = sf->Ashift; | |
1683 Uint64 multmask; | |
1684 | |
1685 __m64 src1, dst1, mm_alpha, mm_zero, dmask; | |
1686 | |
1687 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */ | |
1688 multmask = ~(0xFFFFi64 << (ashift * 2)); | |
1689 dmask = *(__m64*) &multmask; /* dst alpha mask -> dmask */ | |
1690 | |
1691 while(height--) { | |
1692 DUFFS_LOOP4({ | |
1693 Uint32 alpha; | |
1694 | |
1695 _m_prefetch(srcp + 16); | |
1696 _m_prefetch(dstp + 16); | |
1697 | |
1698 alpha = *srcp & amask; | |
1699 if (alpha == 0) { | |
1700 /* do nothing */ | |
1701 } else if (alpha == amask) { | |
1702 /* copy RGB, keep dst alpha */ | |
1703 *dstp = (*srcp & chanmask) | (*dstp & ~chanmask); | |
1704 } else { | |
1705 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB)*/ | |
1706 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */ | |
1707 | |
1708 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB)*/ | |
1709 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */ | |
1710 | |
1711 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */ | |
1712 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */ | |
1713 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ | |
1714 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ | |
1715 mm_alpha = _mm_and_si64(mm_alpha, dmask); /* 000A0A0A -> mm_alpha, preserve dst alpha on add */ | |
1716 | |
1717 /* blend */ | |
1718 src1 = _mm_sub_pi16(src1, dst1);/* src - dst -> src1 */ | |
1719 src1 = _mm_mullo_pi16(src1, mm_alpha); /* (src - dst) * alpha -> src1 */ | |
1720 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1(000R0G0B) */ | |
1721 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst) -> dst1(0A0R0G0B) */ | |
1722 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */ | |
1723 | |
1724 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */ | |
1725 } | |
1726 ++srcp; | |
1727 ++dstp; | |
1728 }, width); | |
1729 srcp += srcskip; | |
1730 dstp += dstskip; | |
1731 } | |
1732 _mm_empty(); | |
1733 } | |
1734 /* End MSVC_ASMBLIT */ | |
1735 | |
1736 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ | |
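
Both 3DNow! variants hint the cache one line ahead of the pointers the loop is about to touch: the GCC path with an inline prefetch 64(%reg), the MSVC path with _m_prefetch(p + 16), where 16 Uint32s is the same 64 bytes. The pattern, as a sketch assuming a 3DNow!-capable target:

```c
#include <mm3dnow.h>

/* Prefetch one 64-byte cache line ahead of the current positions. */
static void prefetch_ahead(void *src, void *dst)
{
    _m_prefetch((char *)src + 64);
    _m_prefetch((char *)dst + 64);
}
```
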
1423 | 1737 |
1424 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ | 1738 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */ |
1425 | 1739 |
1426 /* blend a single 16 bit pixel at 50% */ | 1740 /* blend a single 16 bit pixel at 50% */ |
1427 #define BLEND16_50(d, s, mask) \ | 1741 #define BLEND16_50(d, s, mask) \ |
1528 dstp += dstskip; | 1842 dstp += dstskip; |
1529 } | 1843 } |
1530 } | 1844 } |
1531 } | 1845 } |
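
The comparison view elides the body of BLEND16_50, but it is the 16-bit counterpart of the 32-bit alpha=128 trick earlier in the file: the mask keeps everything except each channel's low bit (0xf7de for RGB565, as the call sites below show), so the halved sum cannot carry between channels. A sketch of the idea, not the verbatim macro:

```c
#include <stdint.h>

static uint16_t blend16_50(uint16_t d, uint16_t s, uint16_t mask)
{
    return (uint16_t)((((s & mask) + (d & mask)) >> 1)
                      + (s & d & (uint16_t)~mask));
}
```
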
1532 | 1846 |
1533 #if MMX_ASMBLIT | 1847 #if GCC_ASMBLIT |
1534 /* fast RGB565->RGB565 blending with surface alpha */ | 1848 /* fast RGB565->RGB565 blending with surface alpha */ |
1535 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) | 1849 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) |
1536 { | 1850 { |
1537 unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */ | 1851 unsigned alpha = info->src->alpha; /* downscale alpha to 5 bits */ |
1538 if(alpha == 128) { | 1852 if(alpha == 128) { |
1542 int height = info->d_height; | 1856 int height = info->d_height; |
1543 Uint16 *srcp = (Uint16 *)info->s_pixels; | 1857 Uint16 *srcp = (Uint16 *)info->s_pixels; |
1544 int srcskip = info->s_skip >> 1; | 1858 int srcskip = info->s_skip >> 1; |
1545 Uint16 *dstp = (Uint16 *)info->d_pixels; | 1859 Uint16 *dstp = (Uint16 *)info->d_pixels; |
1546 int dstskip = info->d_skip >> 1; | 1860 int dstskip = info->d_skip >> 1; |
1547 Uint32 s, d; | 1861 Uint32 s, d; |
1548 Uint8 load[8]; | 1862 Uint8 load[8]; |
1549 | 1863 |
1550 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ | 1864 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ |
1551 *(Uint64 *)load = alpha; | 1865 *(Uint64 *)load = alpha; |
1552 alpha >>= 3; /* downscale alpha to 5 bits */ | 1866 alpha >>= 3; /* downscale alpha to 5 bits */ |
1553 | 1867 |
1554 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ | 1868 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ |
1555 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ | 1869 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ |
1556 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ | 1870 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ |
1871 /* position alpha to allow for mullo and mulhi on diff channels | |
1872 to reduce the number of operations */ | |
1873 psllq_i2r(3, mm0); | |
1557 | 1874 |
1558 /* Setup the 565 color channel masks */ | 1875 /* Setup the 565 color channel masks */ |
1559 *(Uint64 *)load = 0xF800F800F800F800ULL; | |
1560 movq_m2r(*load, mm1); /* MASKRED -> mm1 */ | |
1561 *(Uint64 *)load = 0x07E007E007E007E0ULL; | 1876 *(Uint64 *)load = 0x07E007E007E007E0ULL; |
1562 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */ | 1877 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */ |
1563 *(Uint64 *)load = 0x001F001F001F001FULL; | 1878 *(Uint64 *)load = 0x001F001F001F001FULL; |
1564 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */ | 1879 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */ |
1565 while(height--) { | 1880 while(height--) { |
1566 DUFFS_LOOP_QUATRO2( | 1881 DUFFS_LOOP_QUATRO2( |
1567 { | 1882 { |
1568 s = *srcp++; | 1883 s = *srcp++; |
1569 d = *dstp; | 1884 d = *dstp; |
1570 /* | 1885 /* |
1571 * shift out the middle component (green) to | 1886 * shift out the middle component (green) to |
1572 * the high 16 bits, and process all three RGB | 1887 * the high 16 bits, and process all three RGB |
1573 * components at the same time. | 1888 * components at the same time. |
1575 s = (s | s << 16) & 0x07e0f81f; | 1890 s = (s | s << 16) & 0x07e0f81f; |
1576 d = (d | d << 16) & 0x07e0f81f; | 1891 d = (d | d << 16) & 0x07e0f81f; |
1577 d += (s - d) * alpha >> 5; | 1892 d += (s - d) * alpha >> 5; |
1578 d &= 0x07e0f81f; | 1893 d &= 0x07e0f81f; |
1579 *dstp++ = d | d >> 16; | 1894 *dstp++ = d | d >> 16; |
1580 },{ | 1895 },{ |
1581 s = *srcp++; | 1896 s = *srcp++; |
1582 d = *dstp; | 1897 d = *dstp; |
1583 /* | 1898 /* |
1584 * shift out the middle component (green) to | 1899 * shift out the middle component (green) to |
1585 * the high 16 bits, and process all three RGB | 1900 * the high 16 bits, and process all three RGB |
1586 * components at the same time. | 1901 * components at the same time. |
1588 s = (s | s << 16) & 0x07e0f81f; | 1903 s = (s | s << 16) & 0x07e0f81f; |
1589 d = (d | d << 16) & 0x07e0f81f; | 1904 d = (d | d << 16) & 0x07e0f81f; |
1590 d += (s - d) * alpha >> 5; | 1905 d += (s - d) * alpha >> 5; |
1591 d &= 0x07e0f81f; | 1906 d &= 0x07e0f81f; |
1592 *dstp++ = d | d >> 16; | 1907 *dstp++ = d | d >> 16; |
1593 s = *srcp++; | 1908 s = *srcp++; |
1594 d = *dstp; | 1909 d = *dstp; |
1595 /* | 1910 /* |
1596 * shift out the middle component (green) to | 1911 * shift out the middle component (green) to |
1597 * the high 16 bits, and process all three RGB | 1912 * the high 16 bits, and process all three RGB |
1598 * components at the same time. | 1913 * components at the same time. |
1600 s = (s | s << 16) & 0x07e0f81f; | 1915 s = (s | s << 16) & 0x07e0f81f; |
1601 d = (d | d << 16) & 0x07e0f81f; | 1916 d = (d | d << 16) & 0x07e0f81f; |
1602 d += (s - d) * alpha >> 5; | 1917 d += (s - d) * alpha >> 5; |
1603 d &= 0x07e0f81f; | 1918 d &= 0x07e0f81f; |
1604 *dstp++ = d | d >> 16; | 1919 *dstp++ = d | d >> 16; |
1605 },{ | 1920 },{ |
1606 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ | 1921 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ |
1607 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ | 1922 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ |
1608 | 1923 |
1609 /* RED */ | 1924 /* red -- does not need a mask since the right shift clears |
1610 movq_r2r(mm2, mm5); /* src -> mm5 */ | 1925 the uninteresting bits */ |
1611 pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */ | 1926 movq_r2r(mm2, mm5); /* src -> mm5 */ |
1612 psrlq_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */ | 1927 movq_r2r(mm3, mm6); /* dst -> mm6 */ |
1613 | 1928 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 [000r 000r 000r 000r] */ |
1614 movq_r2r(mm3, mm6); /* dst -> mm6 */ | 1929 psrlw_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */ |
1615 pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */ | 1930 |
1616 psrlq_i2r(11, mm6); /* mm6 >> 11 -> mm6 [000r 000r 000r 000r] */ | 1931 /* blend */ |
1617 | 1932 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ |
1618 /* blend */ | 1933 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ |
1619 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | 1934 /* alpha used is actually 11 bits |
1620 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | 1935 11 + 5 = 16 bits, so the sign bits are lost */ |
1621 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ | 1936 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ |
1622 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | 1937 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ |
1623 psllq_i2r(11, mm6); /* mm6 << 11 -> mm6 */ | 1938 psllw_i2r(11, mm6); /* mm6 << 11 -> mm6 */ |
1624 pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */ | 1939 |
1625 | 1940 movq_r2r(mm6, mm1); /* save new reds in dsts */ |
1626 movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */ | 1941 |
1627 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ | 1942 /* green -- process the bits in place */ |
1628 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */ | 1943 movq_r2r(mm2, mm5); /* src -> mm5 */ |
1629 por_r2r(mm6, mm3); /* save new reds in dsts */ | 1944 movq_r2r(mm3, mm6); /* dst -> mm6 */ |
1630 | 1945 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */ |
1631 /* green */ | 1946 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */ |
1632 movq_r2r(mm2, mm5); /* src -> mm5 */ | 1947 |
1633 pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */ | 1948 /* blend */ |
1634 psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */ | 1949 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ |
1635 | 1950 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ |
1636 movq_r2r(mm3, mm6); /* dst -> mm6 */ | 1951 /* 11 + 11 - 16 = 6 bits, so all the lower uninteresting |
1637 pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */ | 1952 bits are gone and the sign bits present */ |
1638 psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */ | 1953 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ |
1639 | 1954 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ |
1640 /* blend */ | 1955 |
1641 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | 1956 por_r2r(mm6, mm1); /* save new greens in dsts */ |
1642 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | 1957 |
1643 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ | 1958 /* blue */ |
1644 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | 1959 movq_r2r(mm2, mm5); /* src -> mm5 */ |
1645 psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */ | 1960 movq_r2r(mm3, mm6); /* dst -> mm6 */ |
1646 pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */ | 1961 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */ |
1647 | 1962 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ |
1648 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ | 1963 |
1649 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ | 1964 /* blend */ |
1650 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */ | 1965 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ |
1651 por_r2r(mm6, mm3); /* save new greens in dsts */ | 1966 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ |
1652 | 1967 /* 11 + 5 = 16 bits, so the sign bits are lost and |
1653 /* blue */ | 1968 the interesting bits will need to be MASKed */ |
1654 movq_r2r(mm2, mm5); /* src -> mm5 */ | 1969 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ |
1655 pand_r2r(mm7 , mm5); /* src & MASKRED -> mm5[000b 000b 000b 000b] */ | 1970 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ |
1656 | 1971 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */ |
1657 movq_r2r(mm3, mm6); /* dst -> mm6 */ | 1972 |
1658 pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ | 1973 por_r2r(mm6, mm1); /* save new blues in dsts */ |
1659 | 1974 |
1660 /* blend */ | 1975 movq_r2m(mm1, *dstp); /* mm1 -> 4 dst pixels */ |
1661 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | 1976 |
1662 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | 1977 srcp += 4; |
1663 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ | 1978 dstp += 4; |
1664 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | 1979 }, width); |
1665 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */ | |
1666 | |
1667 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ | |
1668 por_r2r(mm4, mm5); /* MASKGREEN | mm5 -> mm5 */ | |
1669 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */ | |
1670 por_r2r(mm6, mm3); /* save new blues in dsts */ | |
1671 | |
1672 movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */ | |
1673 | |
1674 srcp += 4; | |
1675 dstp += 4; | |
1676 }, width); | |
1677 srcp += srcskip; | 1980 srcp += srcskip; |
1678 dstp += dstskip; | 1981 dstp += dstskip; |
1679 } | 1982 } |
1680 emms(); | 1983 emms(); |
1681 } | 1984 } |
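
The scalar branches above rely on a second packing trick: ORing a 565 pixel with itself shifted left 16 and masking with 0x07e0f81f parks green in the high half of a 32-bit word, leaving enough guard bits that one multiply-and-shift blends red, green and blue at once. Spelled out for a single pixel (alpha already downscaled to 5 bits, as in the code):

```c
#include <stdint.h>

static uint16_t blend565(uint16_t sp, uint16_t dp, uint32_t alpha /* 0..31 */)
{
    uint32_t s = (sp | (uint32_t)sp << 16) & 0x07e0f81f;
    uint32_t d = (dp | (uint32_t)dp << 16) & 0x07e0f81f;
    d += (s - d) * alpha >> 5;   /* wraps mod 2^32; guard bits absorb it */
    d &= 0x07e0f81f;
    return (uint16_t)(d | d >> 16);
}
```
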
1692 int height = info->d_height; | 1995 int height = info->d_height; |
1693 Uint16 *srcp = (Uint16 *)info->s_pixels; | 1996 Uint16 *srcp = (Uint16 *)info->s_pixels; |
1694 int srcskip = info->s_skip >> 1; | 1997 int srcskip = info->s_skip >> 1; |
1695 Uint16 *dstp = (Uint16 *)info->d_pixels; | 1998 Uint16 *dstp = (Uint16 *)info->d_pixels; |
1696 int dstskip = info->d_skip >> 1; | 1999 int dstskip = info->d_skip >> 1; |
1697 Uint32 s, d; | 2000 Uint32 s, d; |
1698 Uint8 load[8]; | 2001 Uint8 load[8]; |
1699 | 2002 |
1700 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ | 2003 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ |
1701 *(Uint64 *)load = alpha; | 2004 *(Uint64 *)load = alpha; |
1702 alpha >>= 3; /* downscale alpha to 5 bits */ | 2005 alpha >>= 3; /* downscale alpha to 5 bits */ |
1703 | 2006 |
1704 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ | 2007 movq_m2r(*load, mm0); /* alpha(0000000A) -> mm0 */ |
1705 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ | 2008 punpcklwd_r2r(mm0, mm0); /* 00000A0A -> mm0 */ |
1706 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ | 2009 punpcklwd_r2r(mm0, mm0); /* 0A0A0A0A -> mm0 */ |
1707 | 2010 /* position alpha to allow for mullo and mulhi on diff channels |
1708 /* Setup the 555 color channel masks */ | 2011 to reduce the number of operations */ |
1709 *(Uint64 *)load = 0x7C007C007C007C00ULL; | 2012 psllq_i2r(3, mm0); |
1710 movq_m2r(*load, mm1); /* MASKRED -> mm1 */ | 2013 |
2014 /* Setup the 555 color channel masks */ | |
1711 *(Uint64 *)load = 0x03E003E003E003E0ULL; | 2015 *(Uint64 *)load = 0x03E003E003E003E0ULL; |
1712 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */ | 2016 movq_m2r(*load, mm4); /* MASKGREEN -> mm4 */ |
1713 *(Uint64 *)load = 0x001F001F001F001FULL; | 2017 *(Uint64 *)load = 0x001F001F001F001FULL; |
1714 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */ | 2018 movq_m2r(*load, mm7); /* MASKBLUE -> mm7 */ |
1715 while(height--) { | 2019 while(height--) { |
1716 DUFFS_LOOP_QUATRO2( | 2020 DUFFS_LOOP_QUATRO2( |
1717 { | 2021 { |
1718 s = *srcp++; | 2022 s = *srcp++; |
1719 d = *dstp; | 2023 d = *dstp; |
1720 /* | 2024 /* |
1721 * shift out the middle component (green) to | 2025 * shift out the middle component (green) to |
1722 * the high 16 bits, and process all three RGB | 2026 * the high 16 bits, and process all three RGB |
1723 * components at the same time. | 2027 * components at the same time. |
1725 s = (s | s << 16) & 0x03e07c1f; | 2029 s = (s | s << 16) & 0x03e07c1f; |
1726 d = (d | d << 16) & 0x03e07c1f; | 2030 d = (d | d << 16) & 0x03e07c1f; |
1727 d += (s - d) * alpha >> 5; | 2031 d += (s - d) * alpha >> 5; |
1728 d &= 0x03e07c1f; | 2032 d &= 0x03e07c1f; |
1729 *dstp++ = d | d >> 16; | 2033 *dstp++ = d | d >> 16; |
1730 },{ | 2034 },{ |
1731 s = *srcp++; | 2035 s = *srcp++; |
1732 d = *dstp; | 2036 d = *dstp; |
1733 /* | 2037 /* |
1734 * shift out the middle component (green) to | 2038 * shift out the middle component (green) to |
1735 * the high 16 bits, and process all three RGB | 2039 * the high 16 bits, and process all three RGB |
1736 * components at the same time. | 2040 * components at the same time. |
1750 s = (s | s << 16) & 0x03e07c1f; | 2054 s = (s | s << 16) & 0x03e07c1f; |
1751 d = (d | d << 16) & 0x03e07c1f; | 2055 d = (d | d << 16) & 0x03e07c1f; |
1752 d += (s - d) * alpha >> 5; | 2056 d += (s - d) * alpha >> 5; |
1753 d &= 0x03e07c1f; | 2057 d &= 0x03e07c1f; |
1754 *dstp++ = d | d >> 16; | 2058 *dstp++ = d | d >> 16; |
1755 },{ | 2059 },{ |
1756 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ | 2060 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ |
1757 movq_m2r((*srcp), mm2);/* 4 src pixels -> mm2 */ | 2061 movq_m2r((*dstp), mm3);/* 4 dst pixels -> mm3 */ |
1758 | 2062 |
1759 /* RED */ | 2063 /* red -- process the bits in place */ |
1760 movq_r2r(mm2, mm5); /* src -> mm5 */ | 2064 psllq_i2r(5, mm4); /* turn MASKGREEN into MASKRED */ |
1761 pand_r2r(mm1 , mm5); /* src & MASKRED -> mm5 */ | 2065 /* by reusing the GREEN mask we free up another mmx |
1762 psrlq_i2r(10, mm5); /* mm5 >> 10 -> mm5 [000r 000r 000r 000r] */ | 2066 register to accumulate the result */ |
1763 | 2067 |
1764 movq_r2r(mm3, mm6); /* dst -> mm6 */ | 2068 movq_r2r(mm2, mm5); /* src -> mm5 */ |
1765 pand_r2r(mm1 , mm6); /* dst & MASKRED -> mm6 */ | 2069 movq_r2r(mm3, mm6); /* dst -> mm6 */ |
1766 psrlq_i2r(10, mm6); /* mm6 >> 10 -> mm6 [000r 000r 000r 000r] */ | 2070 pand_r2r(mm4, mm5); /* src & MASKRED -> mm5 */ |
1767 | 2071 pand_r2r(mm4, mm6); /* dst & MASKRED -> mm6 */ |
1768 /* blend */ | 2072 |
1769 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | 2073 /* blend */ |
1770 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | 2074 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ |
1771 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ | 2075 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ |
1772 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | 2076 /* 11 + 15 - 16 = 10 bits, uninteresting bits will be |
1773 psllq_i2r(10, mm6); /* mm6 << 10 -> mm6 */ | 2077 cleared by a MASK below */ |
1774 pand_r2r(mm1, mm6); /* mm6 & MASKRED -> mm6 */ | 2078 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ |
1775 | 2079 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ |
1776 movq_r2r(mm4, mm5); /* MASKGREEN -> mm5 */ | 2080 pand_r2r(mm4, mm6); /* mm6 & MASKRED -> mm6 */ |
1777 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ | 2081 |
1778 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKRED) -> mm3 */ | 2082 psrlq_i2r(5, mm4); /* turn MASKRED back into MASKGREEN */ |
1779 por_r2r(mm6, mm3); /* save new reds in dsts */ | 2083 |
1780 | 2084 movq_r2r(mm6, mm1); /* save new reds in dsts */ |
1781 /* green */ | 2085 |
1782 movq_r2r(mm2, mm5); /* src -> mm5 */ | 2086 /* green -- process the bits in place */ |
1783 pand_r2r(mm4 , mm5); /* src & MASKGREEN -> mm5 */ | 2087 movq_r2r(mm2, mm5); /* src -> mm5 */ |
1784 psrlq_i2r(5, mm5); /* mm5 >> 5 -> mm5 [000g 000g 000g 000g] */ | 2088 movq_r2r(mm3, mm6); /* dst -> mm6 */ |
1785 | 2089 pand_r2r(mm4, mm5); /* src & MASKGREEN -> mm5 */ |
1786 movq_r2r(mm3, mm6); /* dst -> mm6 */ | 2090 pand_r2r(mm4, mm6); /* dst & MASKGREEN -> mm6 */ |
1787 pand_r2r(mm4 , mm6); /* dst & MASKGREEN -> mm6 */ | 2091 |
1788 psrlq_i2r(5, mm6); /* mm6 >> 5 -> mm6 [000g 000g 000g 000g] */ | 2092 /* blend */ |
1789 | 2093 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ |
1790 /* blend */ | 2094 pmulhw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ |
1791 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | 2095 /* 11 + 10 - 16 = 5 bits, so all the lower uninteresting |
1792 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | 2096 bits are gone and the sign bits present */ |
1793 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ | 2097 psllw_i2r(5, mm5); /* mm5 << 5 -> mm5 */ |
1794 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | 2098 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ |
1795 psllq_i2r(5, mm6); /* mm6 << 5 -> mm6 */ | 2099 |
1796 pand_r2r(mm4, mm6); /* mm6 & MASKGREEN -> mm6 */ | 2100 por_r2r(mm6, mm1); /* save new greens in dsts */ |
1797 | 2101 |
1798 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ | 2102 /* blue */ |
1799 por_r2r(mm7, mm5); /* MASKBLUE | mm5 -> mm5 */ | 2103 movq_r2r(mm2, mm5); /* src -> mm5 */ |
1800 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKGREEN) -> mm3 */ | 2104 movq_r2r(mm3, mm6); /* dst -> mm6 */ |
1801 por_r2r(mm6, mm3); /* save new greens in dsts */ | 2105 pand_r2r(mm7, mm5); /* src & MASKBLUE -> mm5[000b 000b 000b 000b] */ |
1802 | 2106 pand_r2r(mm7, mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ |
1803 /* blue */ | 2107 |
1804 movq_r2r(mm2, mm5); /* src -> mm5 */ | 2108 /* blend */ |
1805 pand_r2r(mm7 , mm5); /* src & MASKRED -> mm5[000b 000b 000b 000b] */ | 2109 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ |
1806 | 2110 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ |
1807 movq_r2r(mm3, mm6); /* dst -> mm6 */ | 2111 /* 11 + 5 = 16 bits, so the sign bits are lost and |
1808 pand_r2r(mm7 , mm6); /* dst & MASKBLUE -> mm6[000b 000b 000b 000b] */ | 2112 the interesting bits will need to be MASKed */ |
1809 | 2113 psrlw_i2r(11, mm5); /* mm5 >> 11 -> mm5 */ |
1810 /* blend */ | 2114 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ |
1811 psubw_r2r(mm6, mm5);/* src - dst -> mm5 */ | 2115 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6[000b 000b 000b 000b] */ |
1812 pmullw_r2r(mm0, mm5); /* mm5 * alpha -> mm5 */ | 2116 |
1813 psrlw_i2r(8, mm5); /* mm5 >> 8 -> mm5 */ | 2117 por_r2r(mm6, mm1); /* save new blues in dsts */ |
1814 paddw_r2r(mm5, mm6); /* mm5 + mm6(dst) -> mm6 */ | 2118 |
1815 pand_r2r(mm7, mm6); /* mm6 & MASKBLUE -> mm6 */ | 2119 movq_r2m(mm1, *dstp);/* mm1 -> 4 dst pixels */ |
1816 | 2120 |
1817 movq_r2r(mm1, mm5); /* MASKRED -> mm5 */ | 2121 srcp += 4; |
1818 por_r2r(mm4, mm5); /* MASKGREEN | mm5 -> mm5 */ | 2122 dstp += 4; |
1819 pand_r2r(mm5, mm3); /* mm3 & mm5(!MASKBLUE) -> mm3 */ | 2123 }, width); |
1820 por_r2r(mm6, mm3); /* save new blues in dsts */ | |
1821 | |
1822 movq_r2m(mm3, *dstp);/* mm3 -> 4 dst pixels */ | |
1823 | |
1824 srcp += 4; | |
1825 dstp += 4; | |
1826 }, width); | |
1827 srcp += srcskip; | 2124 srcp += srcskip; |
1828 dstp += dstskip; | 2125 dstp += dstskip; |
1829 } | 2126 } |
1830 emms(); | 2127 emms(); |
1831 } | 2128 } |
1832 } | 2129 } |
1833 #endif | 2130 /* End GCC_ASMBLIT */ |
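
The old and new columns above compute the same 565 blend; the rewrite on the right keeps each channel at its native bit position and lets pmulhw/psllw drop the product where the channel already lives, instead of shifting every channel down and back up. A minimal scalar model of the per-channel math, assuming the usual 5-bit reduction of the surface alpha (the names are illustrative, not SDL API):

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar model of the per-channel blend the MMX code above does on
       four pixels at once: d += ((s - d) * alpha5) >> 5 per channel,
       with alpha5 the surface alpha cut down to 5 bits. */
    static uint16_t blend565_scalar(uint16_t s, uint16_t d, unsigned alpha)
    {
        int a5 = (int)((alpha & ~7u) >> 3);  /* 5-bit alpha, low bits cut */
        int sr = (s >> 11) & 0x1f, dr = (d >> 11) & 0x1f;
        int sg = (s >> 5) & 0x3f,  dg = (d >> 5) & 0x3f;
        int sb = s & 0x1f,         db = d & 0x1f;

        /* relies on arithmetic right shift of negatives, as on x86 */
        dr += ((sr - dr) * a5) >> 5;
        dg += ((sg - dg) * a5) >> 5;
        db += ((sb - db) * a5) >> 5;

        return (uint16_t)((dr << 11) | (dg << 5) | db);
    }

    int main(void)
    {
        printf("%04x\n", blend565_scalar(0xf800, 0x001f, 128)); /* red over blue */
        return 0;
    }

The bit-count comments on the right ("11 + 15 - 16 = 10 bits" and friends) are the bookkeeping for where that (s - d) * alpha5 product lands once alpha is pre-positioned and pmulhw keeps only the high 16 bits of each 16x16 product.
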
2131 | |
2132 #elif MSVC_ASMBLIT | 
2133 /* fast RGB565->RGB565 blending with surface alpha */ | |
2134 static void Blit565to565SurfaceAlphaMMX(SDL_BlitInfo *info) | |
2135 { | |
2136 unsigned alpha = info->src->alpha; | |
2137 if(alpha == 128) { | |
2138 Blit16to16SurfaceAlpha128(info, 0xf7de); | |
2139 } else { | |
2140 int width = info->d_width; | |
2141 int height = info->d_height; | |
2142 Uint16 *srcp = (Uint16 *)info->s_pixels; | |
2143 int srcskip = info->s_skip >> 1; | |
2144 Uint16 *dstp = (Uint16 *)info->d_pixels; | |
2145 int dstskip = info->d_skip >> 1; | |
2146 Uint32 s, d; | |
2147 | |
2148 __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha; | |
2149 | |
2150 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ | |
2151 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */ | |
2152 alpha >>= 3; /* downscale alpha to 5 bits */ | |
2153 | |
2154 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ | |
2155 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ | |
2156 /* position alpha to allow for mullo and mulhi on different channels | 
2157 to reduce the number of operations (see the positioning sketch below) */ | 
2158 mm_alpha = _mm_slli_si64(mm_alpha, 3); | |
2159 | |
2160 /* Setup the 565 color channel masks */ | |
2161 gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0); /* MASKGREEN -> gmask */ | |
2162 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */ | |
2163 | |
2164 while(height--) { | |
2165 DUFFS_LOOP_QUATRO2( | |
2166 { | |
2167 s = *srcp++; | |
2168 d = *dstp; | |
2169 /* | |
2170 * shift out the middle component (green) to | |
2171 * the high 16 bits, and process all three RGB | |
2172 * components at the same time. | |
2173 */ | |
2174 s = (s | s << 16) & 0x07e0f81f; | |
2175 d = (d | d << 16) & 0x07e0f81f; | |
2176 d += (s - d) * alpha >> 5; | |
2177 d &= 0x07e0f81f; | |
2178 *dstp++ = d | d >> 16; | |
2179 },{ | |
2180 s = *srcp++; | |
2181 d = *dstp; | |
2182 /* | |
2183 * shift out the middle component (green) to | |
2184 * the high 16 bits, and process all three RGB | |
2185 * components at the same time. | |
2186 */ | |
2187 s = (s | s << 16) & 0x07e0f81f; | |
2188 d = (d | d << 16) & 0x07e0f81f; | |
2189 d += (s - d) * alpha >> 5; | |
2190 d &= 0x07e0f81f; | |
2191 *dstp++ = d | d >> 16; | |
2192 s = *srcp++; | |
2193 d = *dstp; | |
2194 /* | |
2195 * shift out the middle component (green) to | |
2196 * the high 16 bits, and process all three RGB | |
2197 * components at the same time. | |
2198 */ | |
2199 s = (s | s << 16) & 0x07e0f81f; | |
2200 d = (d | d << 16) & 0x07e0f81f; | |
2201 d += (s - d) * alpha >> 5; | |
2202 d &= 0x07e0f81f; | |
2203 *dstp++ = d | d >> 16; | |
2204 },{ | |
2205 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */ | |
2206 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */ | |
2207 | |
2208 /* red */ | |
2209 src2 = src1; | |
2210 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */ | |
2211 | |
2212 dst2 = dst1; | |
2213 dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */ | |
2214 | |
2215 /* blend */ | |
2216 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ | |
2217 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | |
2218 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ | |
2219 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ | |
2220 dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */ | |
2221 | |
2222 mm_res = dst2; /* RED -> mm_res */ | |
2223 | |
2224 /* green -- process the bits in place */ | |
2225 src2 = src1; | |
2226 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */ | |
2227 | |
2228 dst2 = dst1; | |
2229 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */ | |
2230 | |
2231 /* blend */ | |
2232 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ | |
2233 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | |
2234 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ | |
2235 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ | |
2236 | |
2237 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */ | |
2238 | |
2239 /* blue */ | |
2240 src2 = src1; | |
2241 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */ | |
2242 | |
2243 dst2 = dst1; | |
2244 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */ | |
2245 | |
2246 /* blend */ | |
2247 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ | |
2248 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | |
2249 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ | |
2250 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ | |
2251 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */ | |
2252 | |
2253 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */ | |
2254 | |
2255 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */ | |
2256 | |
2257 srcp += 4; | |
2258 dstp += 4; | |
2259 }, width); | |
2260 srcp += srcskip; | |
2261 dstp += dstskip; | |
2262 } | |
2263 _mm_empty(); | |
2264 } | |
2265 } | |
2266 | |
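Between the MMX quads, the function above blends leftover pixels with the classic packed-565 trick: widening the pixel to 32 bits parks green in the high half, giving every channel enough headroom that one multiply blends all three at once. A self-contained scalar sketch of that path (blend565_packed is an illustrative name; the body mirrors the fallback code above, and the 555 function below plays the same game with 0x03e07c1f):

    #include <stdint.h>
    #include <stdio.h>

    /* One-pixel 565 blend in 0x07e0f81f form: green in the high 16 bits,
       red and blue in the low 16, so (s - d) * alpha5 >> 5 touches all
       three channels in one go.  The final mask discards cross-channel
       garbage and d | d >> 16 folds green back down. */
    static uint16_t blend565_packed(uint16_t src, uint16_t dst, unsigned alpha5)
    {
        uint32_t s = src, d = dst;
        s = (s | s << 16) & 0x07e0f81f;
        d = (d | d << 16) & 0x07e0f81f;
        d += (s - d) * alpha5 >> 5;
        d &= 0x07e0f81f;
        return (uint16_t)(d | d >> 16);
    }

    int main(void)
    {
        printf("%04x\n", blend565_packed(0xf800, 0x001f, 16)); /* half red over blue */
        return 0;
    }
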
2267 /* fast RGB555->RGB555 blending with surface alpha */ | |
2268 static void Blit555to555SurfaceAlphaMMX(SDL_BlitInfo *info) | |
2269 { | |
2270 unsigned alpha = info->src->alpha; | |
2271 if(alpha == 128) { | |
2272 Blit16to16SurfaceAlpha128(info, 0xfbde); | |
2273 } else { | |
2274 int width = info->d_width; | |
2275 int height = info->d_height; | |
2276 Uint16 *srcp = (Uint16 *)info->s_pixels; | |
2277 int srcskip = info->s_skip >> 1; | |
2278 Uint16 *dstp = (Uint16 *)info->d_pixels; | |
2279 int dstskip = info->d_skip >> 1; | |
2280 Uint32 s, d; | |
2281 | |
2282 __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha; | |
2283 | |
2284 alpha &= ~(1+2+4); /* cut alpha to get the exact same behaviour */ | |
2285 mm_alpha = _mm_set_pi32(0, alpha); /* 0000000A -> mm_alpha */ | |
2286 alpha >>= 3; /* downscale alpha to 5 bits */ | |
2287 | |
2288 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */ | |
2289 mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha */ | |
2290 /* position alpha to allow for mullo and mulhi on different channels | 
2291 to reduce the number of operations (see the positioning sketch below) */ | 
2292 mm_alpha = _mm_slli_si64(mm_alpha, 3); | |
2293 | |
2294 /* Setup the 555 color channel masks */ | |
2295 rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00); /* MASKRED -> rmask */ | |
2296 gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0); /* MASKGREEN -> gmask */ | |
2297 bmask = _mm_set_pi32(0x001F001F, 0x001F001F); /* MASKBLUE -> bmask */ | |
2298 | |
2299 while(height--) { | |
2300 DUFFS_LOOP_QUATRO2( | |
2301 { | |
2302 s = *srcp++; | |
2303 d = *dstp; | |
2304 /* | |
2305 * shift out the middle component (green) to | |
2306 * the high 16 bits, and process all three RGB | |
2307 * components at the same time. | |
2308 */ | |
2309 s = (s | s << 16) & 0x03e07c1f; | |
2310 d = (d | d << 16) & 0x03e07c1f; | |
2311 d += (s - d) * alpha >> 5; | |
2312 d &= 0x03e07c1f; | |
2313 *dstp++ = d | d >> 16; | |
2314 },{ | |
2315 s = *srcp++; | |
2316 d = *dstp; | |
2317 /* | |
2318 * shift out the middle component (green) to | |
2319 * the high 16 bits, and process all three RGB | |
2320 * components at the same time. | |
2321 */ | |
2322 s = (s | s << 16) & 0x03e07c1f; | |
2323 d = (d | d << 16) & 0x03e07c1f; | |
2324 d += (s - d) * alpha >> 5; | |
2325 d &= 0x03e07c1f; | |
2326 *dstp++ = d | d >> 16; | |
2327 s = *srcp++; | |
2328 d = *dstp; | |
2329 /* | |
2330 * shift out the middle component (green) to | |
2331 * the high 16 bits, and process all three RGB | |
2332 * components at the same time. | |
2333 */ | |
2334 s = (s | s << 16) & 0x03e07c1f; | |
2335 d = (d | d << 16) & 0x03e07c1f; | |
2336 d += (s - d) * alpha >> 5; | |
2337 d &= 0x03e07c1f; | |
2338 *dstp++ = d | d >> 16; | |
2339 },{ | |
2340 src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */ | |
2341 dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */ | |
2342 | |
2343 /* red -- process the bits in place */ | |
2344 src2 = src1; | |
2345 src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */ | |
2346 | |
2347 dst2 = dst1; | |
2348 dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */ | |
2349 | |
2350 /* blend */ | |
2351 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ | |
2352 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | |
2353 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ | |
2354 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ | |
2355 dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */ | |
2356 | |
2357 mm_res = dst2; /* RED -> mm_res */ | |
2358 | |
2359 /* green -- process the bits in place */ | |
2360 src2 = src1; | |
2361 src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */ | |
2362 | |
2363 dst2 = dst1; | |
2364 dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */ | |
2365 | |
2366 /* blend */ | |
2367 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ | |
2368 src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | |
2369 src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */ | |
2370 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ | |
2371 | |
2372 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */ | |
2373 | |
2374 /* blue */ | |
2375 src2 = src1; /* src -> src2 */ | |
2376 src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */ | |
2377 | |
2378 dst2 = dst1; /* dst -> dst2 */ | |
2379 dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */ | |
2380 | |
2381 /* blend */ | |
2382 src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */ | |
2383 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */ | |
2384 src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */ | |
2385 dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */ | |
2386 dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */ | |
2387 | |
2388 mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */ | |
2389 | |
2390 *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */ | |
2391 | |
2392 srcp += 4; | |
2393 dstp += 4; | |
2394 }, width); | |
2395 srcp += srcskip; | |
2396 dstp += dstskip; | |
2397 } | |
2398 _mm_empty(); | |
2399 } | |
2400 } | |
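
Both 16-bit functions above hand the alpha == 128 case to Blit16to16SurfaceAlpha128, and the mask argument (0xf7de for 565, 0xfbde for 555) is the pixel format with the lowest bit of every channel cleared. That is what enables the standard masked-average trick, sketched here (SDL's exact expression may differ; this is the identity those masks exist for):

    #include <stdint.h>

    /* Per-channel average of two 16-bit pixels without unpacking,
       using s + d = 2*(s & d) + (s ^ d).  Clearing each channel's low
       bit in the xor term before the shift keeps one channel's bits
       from bleeding into the channel below it. */
    static uint16_t avg16(uint16_t s, uint16_t d, uint16_t lowbit_mask)
    {
        return (uint16_t)((s & d) + (((s ^ d) & lowbit_mask) >> 1));
    }
    /* avg16(s, d, 0xf7de) for RGB565; avg16(s, d, 0xfbde) for RGB555 */
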
2401 #endif /* GCC_ASMBLIT, MSVC_ASMBLIT */ | |
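
The "position alpha" setup in both MSVC functions above stores the 5-bit alpha pre-shifted: alpha &= ~(1+2+4) leaves alpha5 << 3, and _mm_slli_si64 by 3 makes it alpha5 << 6. That single placement serves every channel, since _mm_mulhi_pi16 then blends any channel sitting at bit 5 or above in place, while _mm_mullo_pi16 covers blue at bit 0. A self-checking scalar model of the shift bookkeeping (positive differences only, for simplicity; the real code also leans on pmulhw's signed arithmetic for negative src - dst):

    #include <stdint.h>
    #include <assert.h>

    int main(void)
    {
        unsigned alpha5 = 19, delta = 13;  /* arbitrary 5-bit values */
        uint32_t a = alpha5 << 6;          /* the pre-shifted alpha   */

        /* 555 red at bit 10: mulhi, << 5, then mask with rmask */
        uint32_t red = ((((delta << 10) * a) >> 16) << 5) & 0x7C00;
        assert(red == ((delta * alpha5) >> 5) << 10);

        /* 555/565 green at bit 5: mulhi then << 5, no mask needed */
        uint32_t green = (((delta << 5) * a) >> 16) << 5;
        assert(green == ((delta * alpha5) >> 5) << 5);

        /* blue at bit 0: mullo (low 16 bits of the product), >> 11 */
        uint32_t blue = (uint16_t)(delta * a) >> 11;
        assert(blue == (delta * alpha5) >> 5);

        return 0;
    }

Either placement works for red: the 565 function above shifts red down to bit 0 and blends it with mullo, while the GCC column earlier blends 565 red in place with pmulhw.
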
1834 | 2402 |
1835 /* fast RGB565->RGB565 blending with surface alpha */ | 2403 /* fast RGB565->RGB565 blending with surface alpha */ |
1836 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info) | 2404 static void Blit565to565SurfaceAlpha(SDL_BlitInfo *info) |
1837 { | 2405 { |
1838 unsigned alpha = info->src->alpha; | 2406 unsigned alpha = info->src->alpha; |
2175 | 2743 |
2176 case 4: | 2744 case 4: |
2177 if(sf->Rmask == df->Rmask | 2745 if(sf->Rmask == df->Rmask |
2178 && sf->Gmask == df->Gmask | 2746 && sf->Gmask == df->Gmask |
2179 && sf->Bmask == df->Bmask | 2747 && sf->Bmask == df->Bmask |
2180 && (sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff | |
2181 && sf->BytesPerPixel == 4) | 2748 && sf->BytesPerPixel == 4) |
2182 { | 2749 { |
2183 #if MMX_ASMBLIT | 2750 #if MMX_ASMBLIT |
2184 if(SDL_HasMMX()) | 2751 if(sf->Rshift % 8 == 0 |
2185 return BlitRGBtoRGBSurfaceAlphaMMX; | 2752 && sf->Gshift % 8 == 0 |
2753 && sf->Bshift % 8 == 0 | |
2754 && SDL_HasMMX()) | |
2755 return BlitRGBtoRGBSurfaceAlphaMMX; | |
2756 #endif | |
2757 if((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) | |
2758 { | |
2759 #if SDL_ALTIVEC_BLITTERS | 
2760 if(SDL_HasAltiVec()) | |
2761 return BlitRGBtoRGBSurfaceAlphaAltivec; | |
2762 #endif | |
2763 return BlitRGBtoRGBSurfaceAlpha; | |
2764 } | |
2765 } | |
2766 #if SDL_ALTIVEC_BLITTERS | |
2767 if((sf->BytesPerPixel == 4) && | |
2768 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) | |
2769 return Blit32to32SurfaceAlphaAltivec; | |
2186 else | 2770 else |
2187 #endif | 2771 #endif |
2188 #if SDL_ALTIVEC_BLITTERS | 2772 return BlitNtoNSurfaceAlpha; |
2189 if(!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) | |
2190 return BlitRGBtoRGBSurfaceAlphaAltivec; | |
2191 else | |
2192 #endif | |
2193 return BlitRGBtoRGBSurfaceAlpha; | |
2194 } | |
2195 else | |
2196 #if SDL_ALTIVEC_BLITTERS | |
2197 if((sf->BytesPerPixel == 4) && | |
2198 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) | |
2199 return Blit32to32SurfaceAlphaAltivec; | |
2200 else | |
2201 #endif | |
2202 return BlitNtoNSurfaceAlpha; | |
2203 | 2773 |
2204 case 3: | 2774 case 3: |
2205 default: | 2775 default: |
2206 return BlitNtoNSurfaceAlpha; | 2776 return BlitNtoNSurfaceAlpha; |
2207 } | 2777 } |
2230 return BlitARGBto555PixelAlpha; | 2800 return BlitARGBto555PixelAlpha; |
2231 } | 2801 } |
2232 return BlitNtoNPixelAlpha; | 2802 return BlitNtoNPixelAlpha; |
2233 | 2803 |
2234 case 4: | 2804 case 4: |
2235 if(sf->Amask == 0xff000000 | 2805 if(sf->Rmask == df->Rmask |
2236 && sf->Rmask == df->Rmask | |
2237 && sf->Gmask == df->Gmask | 2806 && sf->Gmask == df->Gmask |
2238 && sf->Bmask == df->Bmask | 2807 && sf->Bmask == df->Bmask |
2239 && sf->BytesPerPixel == 4) | 2808 && sf->BytesPerPixel == 4) |
2240 { | 2809 { |
2241 #if MMX_ASMBLIT | 2810 #if MMX_ASMBLIT |
2242 if(SDL_Has3DNow()) | 2811 if(sf->Rshift % 8 == 0 |
2243 return BlitRGBtoRGBPixelAlphaMMX3DNOW; | 2812 && sf->Gshift % 8 == 0 |
2244 else | 2813 && sf->Bshift % 8 == 0 |
2245 if(SDL_HasMMX()) | 2814 && sf->Ashift % 8 == 0 |
2246 return BlitRGBtoRGBPixelAlphaMMX; | 2815 && sf->Aloss == 0) |
2247 else | 2816 { |
2817 if(SDL_Has3DNow()) | |
2818 return BlitRGBtoRGBPixelAlphaMMX3DNOW; | |
2819 if(SDL_HasMMX()) | |
2820 return BlitRGBtoRGBPixelAlphaMMX; | |
2821 } | |
2248 #endif | 2822 #endif |
2249 #if SDL_ALTIVEC_BLITTERS | 2823 if(sf->Amask == 0xff000000) |
2250 if(!(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) | 2824 { |
2251 return BlitRGBtoRGBPixelAlphaAltivec; | 2825 #if SDL_ALTIVEC_BLITTERS
2252 else | 2826 if(SDL_HasAltiVec()) |
2827 return BlitRGBtoRGBPixelAlphaAltivec; | |
2253 #endif | 2828 #endif |
2254 return BlitRGBtoRGBPixelAlpha; | 2829 return BlitRGBtoRGBPixelAlpha; |
2830 } | |
2255 } | 2831 } |
2256 #if SDL_ALTIVEC_BLITTERS | 2832 #if SDL_ALTIVEC_BLITTERS |
2257 if (sf->Amask && sf->BytesPerPixel == 4 && | 2833 if (sf->Amask && sf->BytesPerPixel == 4 && |
2258 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) | 2834 !(surface->map->dst->flags & SDL_HWSURFACE) && SDL_HasAltiVec()) |
2259 return Blit32to32PixelAlphaAltivec; | 2835 return Blit32to32PixelAlphaAltivec; |
2260 else | 2836 else |
2261 #endif | 2837 #endif |
2262 return BlitNtoNPixelAlpha; | 2838 return BlitNtoNPixelAlpha; |
2263 | 2839 |
2264 case 3: | 2840 case 3: |
2265 default: | 2841 default: |
2266 return BlitNtoNPixelAlpha; | 2842 return BlitNtoNPixelAlpha; |
2267 } | 2843 } |
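
Both selector switches above gate the MMX fast paths on channel alignment rather than on one exact mask layout: per-surface blending needs every colour channel to start on a byte boundary, and per-pixel blending additionally needs a full, byte-aligned 8-bit alpha (Aloss == 0). A plain-C sketch of the two gates (the struct and helpers are illustrative stand-ins for the SDL_PixelFormat fields they read):

    #include <stdio.h>

    /* Stand-in for the SDL_PixelFormat fields the dispatch code reads. */
    struct fmt { int Rshift, Gshift, Bshift, Ashift, Aloss; };

    /* Per-surface-alpha MMX gate: colour channels byte-aligned. */
    static int mmx_surface_ok(const struct fmt *f)
    {
        return f->Rshift % 8 == 0 && f->Gshift % 8 == 0 && f->Bshift % 8 == 0;
    }

    /* Per-pixel-alpha MMX gate: additionally a full 8-bit alpha. */
    static int mmx_pixel_ok(const struct fmt *f)
    {
        return mmx_surface_ok(f) && f->Ashift % 8 == 0 && f->Aloss == 0;
    }

    int main(void)
    {
        struct fmt argb8888 = { 16, 8, 0, 24, 0 }; /* the old fast-path layout */
        struct fmt rgba8888 = { 24, 16, 8, 0, 0 }; /* now also admitted */
        printf("ARGB8888: surface=%d pixel=%d\n",
               mmx_surface_ok(&argb8888), mmx_pixel_ok(&argb8888));
        printf("RGBA8888: surface=%d pixel=%d\n",
               mmx_surface_ok(&rgba8888), mmx_pixel_ok(&rgba8888));
        return 0;
    }
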