comparison src/video/SDL_RLEaccel.c @ 3035:ff602fdfdedc

Removed Rafal Bursig's MMX RLE code, at his request.
author Sam Lantinga <slouken@libsdl.org>
date Tue, 13 Jan 2009 07:20:55 +0000
parents 99210400e8b9
children dc1eb82ffdaa
comparing 3034:0e821769fc51 with 3035:ff602fdfdedc
89 89 #include "SDL_video.h"
90 90 #include "SDL_sysvideo.h"
91 91 #include "SDL_blit.h"
92 92 #include "SDL_RLEaccel_c.h"
93 93
94 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
95 #define MMX_ASMBLIT
96 #endif
97
98 #ifdef MMX_ASMBLIT
99 #include "mmx.h"
100 #include "SDL_cpuinfo.h"
101 #endif
102
103 94 #ifndef MAX
104 95 #define MAX(a, b) ((a) > (b) ? (a) : (b))
105 96 #endif
106 97 #ifndef MIN
107 98 #define MIN(a, b) ((a) < (b) ? (a) : (b))
120 111 * Various colorkey blit methods, for opaque and per-surface alpha
121 112 */
122 113
123 114 #define OPAQUE_BLIT(to, from, length, bpp, alpha) \
124 115 PIXEL_COPY(to, from, length, bpp)
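
For reference, the opaque case needs no blending at all. PIXEL_COPY is defined earlier in this file (outside this hunk); assuming it copies length pixels of bpp bytes each, an equivalent sketch for a contiguous run is simply:

    /* Minimal sketch of an opaque blit: copy length pixels of bpp
     * bytes each; for a contiguous run this is a plain memcpy.
     * The function name is illustrative, not part of SDL. */
    static void opaque_blit_sketch(Uint8 * to, const Uint8 * from,
                                   int length, int bpp)
    {
        SDL_memcpy(to, from, (size_t) length * bpp);
    }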
125
126 #ifdef MMX_ASMBLIT
127
128 #define ALPHA_BLIT32_888MMX(to, from, length, bpp, alpha) \
129 do { \
130 Uint32 *srcp = (Uint32 *)(from); \
131 Uint32 *dstp = (Uint32 *)(to); \
132 int i = 0x00FF00FF; \
133 movd_m2r(*(&i), mm3); \
134 punpckldq_r2r(mm3, mm3); \
135 i = 0xFF000000; \
136 movd_m2r(*(&i), mm7); \
137 punpckldq_r2r(mm7, mm7); \
138 i = alpha | alpha << 16; \
139 movd_m2r(*(&i), mm4); \
140 punpckldq_r2r(mm4, mm4); \
141 pcmpeqd_r2r(mm5, mm5); /* set mm5 to all 1s */ \
142 pxor_r2r(mm7, mm5); /* make clear alpha mask */ \
143 i = length; \
144 if(i & 1) { \
145 movd_m2r((*srcp), mm1); /* src -> mm1 */ \
146 punpcklbw_r2r(mm1, mm1); \
147 pand_r2r(mm3, mm1); \
148 movd_m2r((*dstp), mm2); /* dst -> mm2 */ \
149 punpcklbw_r2r(mm2, mm2); \
150 pand_r2r(mm3, mm2); \
151 psubw_r2r(mm2, mm1); \
152 pmullw_r2r(mm4, mm1); \
153 psrlw_i2r(8, mm1); \
154 paddw_r2r(mm1, mm2); \
155 pand_r2r(mm3, mm2); \
156 packuswb_r2r(mm2, mm2); \
157 pand_r2r(mm5, mm2); /* 0x00RRGGBB -> mm2 */ \
158 movd_r2m(mm2, *dstp); \
159 ++srcp; \
160 ++dstp; \
161 i--; \
162 } \
163 for(; i > 0; --i) { \
164 movq_m2r((*srcp), mm0); \
165 movq_r2r(mm0, mm1); \
166 punpcklbw_r2r(mm0, mm0); \
167 movq_m2r((*dstp), mm2); \
168 punpckhbw_r2r(mm1, mm1); \
169 movq_r2r(mm2, mm6); \
170 pand_r2r(mm3, mm0); \
171 punpcklbw_r2r(mm2, mm2); \
172 pand_r2r(mm3, mm1); \
173 punpckhbw_r2r(mm6, mm6); \
174 pand_r2r(mm3, mm2); \
175 psubw_r2r(mm2, mm0); \
176 pmullw_r2r(mm4, mm0); \
177 pand_r2r(mm3, mm6); \
178 psubw_r2r(mm6, mm1); \
179 pmullw_r2r(mm4, mm1); \
180 psrlw_i2r(8, mm0); \
181 paddw_r2r(mm0, mm2); \
182 psrlw_i2r(8, mm1); \
183 paddw_r2r(mm1, mm6); \
184 pand_r2r(mm3, mm2); \
185 pand_r2r(mm3, mm6); \
186 packuswb_r2r(mm2, mm2); \
187 packuswb_r2r(mm6, mm6); \
188 psrlq_i2r(32, mm2); \
189 psllq_i2r(32, mm6); \
190 por_r2r(mm6, mm2); \
191 pand_r2r(mm5, mm2); /* 0x00RRGGBB -> mm2 */ \
192 movq_r2m(mm2, *dstp); \
193 srcp += 2; \
194 dstp += 2; \
195 i--; \
196 } \
197 emms(); \
198 } while(0)
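
For readers without the mmx.h pseudo-intrinsics at hand: the macro above computes the standard blend d += (s - d) * alpha >> 8 on each 8-bit channel, working in 16-bit lanes built with punpcklbw/punpckhbw, and clears the destination alpha byte with the mm5 mask. A scalar reference for the per-pixel math (names are illustrative; the >> on a negative product assumes the usual arithmetic shift):

    /* One 0x00RRGGBB pixel blended the same way the MMX loop does. */
    static Uint32 blend32_888_sketch(Uint32 s, Uint32 d, int alpha)
    {
        Uint32 out = 0;
        int shift;
        for (shift = 0; shift < 24; shift += 8) {
            int sc = (int) ((s >> shift) & 0xff);
            int dc = (int) ((d >> shift) & 0xff);
            dc += ((sc - dc) * alpha) >> 8;    /* stays in 0..255 */
            out |= (Uint32) dc << shift;
        }
        return out;    /* top (alpha) byte left clear, as mm5 enforces */
    }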
199
200 #define ALPHA_BLIT16_565MMX(to, from, length, bpp, alpha) \
201 do { \
202 int i, n = 0; \
203 Uint16 *srcp = (Uint16 *)(from); \
204 Uint16 *dstp = (Uint16 *)(to); \
205 Uint32 ALPHA = 0xF800; \
206 movd_m2r(*(&ALPHA), mm1); \
207 punpcklwd_r2r(mm1, mm1); \
208 punpcklwd_r2r(mm1, mm1); \
209 ALPHA = 0x07E0; \
210 movd_m2r(*(&ALPHA), mm4); \
211 punpcklwd_r2r(mm4, mm4); \
212 punpcklwd_r2r(mm4, mm4); \
213 ALPHA = 0x001F; \
214 movd_m2r(*(&ALPHA), mm7); \
215 punpcklwd_r2r(mm7, mm7); \
216 punpcklwd_r2r(mm7, mm7); \
217 alpha &= ~(1+2+4); \
218 i = (Uint32)alpha | (Uint32)alpha << 16; \
219 movd_m2r(*(&i), mm0); \
220 punpckldq_r2r(mm0, mm0); \
221 ALPHA = alpha >> 3; \
222 i = ((int)(length) & 3); \
223 for(; i > 0; --i) { \
224 Uint32 s = *srcp++; \
225 Uint32 d = *dstp; \
226 s = (s | s << 16) & 0x07e0f81f; \
227 d = (d | d << 16) & 0x07e0f81f; \
228 d += (s - d) * ALPHA >> 5; \
229 d &= 0x07e0f81f; \
230 *dstp++ = d | d >> 16; \
231 n++; \
232 } \
233 i = (int)(length) - n; \
234 for(; i > 0; --i) { \
235 movq_m2r((*dstp), mm3); \
236 movq_m2r((*srcp), mm2); \
237 movq_r2r(mm2, mm5); \
238 pand_r2r(mm1 , mm5); \
239 psrlq_i2r(11, mm5); \
240 movq_r2r(mm3, mm6); \
241 pand_r2r(mm1 , mm6); \
242 psrlq_i2r(11, mm6); \
243 psubw_r2r(mm6, mm5); \
244 pmullw_r2r(mm0, mm5); \
245 psrlw_i2r(8, mm5); \
246 paddw_r2r(mm5, mm6); \
247 psllq_i2r(11, mm6); \
248 pand_r2r(mm1, mm6); \
249 movq_r2r(mm4, mm5); \
250 por_r2r(mm7, mm5); \
251 pand_r2r(mm5, mm3); \
252 por_r2r(mm6, mm3); \
253 movq_r2r(mm2, mm5); \
254 pand_r2r(mm4 , mm5); \
255 psrlq_i2r(5, mm5); \
256 movq_r2r(mm3, mm6); \
257 pand_r2r(mm4 , mm6); \
258 psrlq_i2r(5, mm6); \
259 psubw_r2r(mm6, mm5); \
260 pmullw_r2r(mm0, mm5); \
261 psrlw_i2r(8, mm5); \
262 paddw_r2r(mm5, mm6); \
263 psllq_i2r(5, mm6); \
264 pand_r2r(mm4, mm6); \
265 movq_r2r(mm1, mm5); \
266 por_r2r(mm7, mm5); \
267 pand_r2r(mm5, mm3); \
268 por_r2r(mm6, mm3); \
269 movq_r2r(mm2, mm5); \
270 pand_r2r(mm7 , mm5); \
271 movq_r2r(mm3, mm6); \
272 pand_r2r(mm7 , mm6); \
273 psubw_r2r(mm6, mm5); \
274 pmullw_r2r(mm0, mm5); \
275 psrlw_i2r(8, mm5); \
276 paddw_r2r(mm5, mm6); \
277 pand_r2r(mm7, mm6); \
278 movq_r2r(mm1, mm5); \
279 por_r2r(mm4, mm5); \
280 pand_r2r(mm5, mm3); \
281 por_r2r(mm6, mm3); \
282 movq_r2m(mm3, *dstp); \
283 srcp += 4; \
284 dstp += 4; \
285 i -= 3; \
286 } \
287 emms(); \
288 } while(0)
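
The scalar head loop above (run on the 1-3 leftover pixels before the four-at-a-time MMX loop) uses the classic RGB565 spread trick: widening one 16-bit pixel into a 32-bit word so all three fields can be blended with a single multiply. As a standalone sketch (the function name is illustrative):

    /* Blend one RGB565 pixel via the 0x07e0f81f spread, as in the
     * macro's head loop. alpha5 is the blend factor reduced to 5 bits
     * (alpha >> 3) so the products fit in the gaps between fields. */
    static Uint16 blend16_565_sketch(Uint16 src, Uint16 dst, Uint32 alpha5)
    {
        Uint32 s = src, d = dst;
        s = (s | s << 16) & 0x07e0f81f;   /* G moves high, R/B stay low */
        d = (d | d << 16) & 0x07e0f81f;
        d += (s - d) * alpha5 >> 5;       /* one multiply blends R, G, B */
        d &= 0x07e0f81f;
        return (Uint16) (d | d >> 16);    /* fold the fields back down */
    }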
289
290 #define ALPHA_BLIT16_555MMX(to, from, length, bpp, alpha) \
291 do { \
292 int i, n = 0; \
293 Uint16 *srcp = (Uint16 *)(from); \
294 Uint16 *dstp = (Uint16 *)(to); \
295 Uint32 ALPHA = 0x7C00; \
296 movd_m2r(*(&ALPHA), mm1); \
297 punpcklwd_r2r(mm1, mm1); \
298 punpcklwd_r2r(mm1, mm1); \
299 ALPHA = 0x03E0; \
300 movd_m2r(*(&ALPHA), mm4); \
301 punpcklwd_r2r(mm4, mm4); \
302 punpcklwd_r2r(mm4, mm4); \
303 ALPHA = 0x001F; \
304 movd_m2r(*(&ALPHA), mm7); \
305 punpcklwd_r2r(mm7, mm7); \
306 punpcklwd_r2r(mm7, mm7); \
307 alpha &= ~(1+2+4); \
308 i = (Uint32)alpha | (Uint32)alpha << 16; \
309 movd_m2r(*(&i), mm0); \
310 punpckldq_r2r(mm0, mm0); \
311 i = ((int)(length) & 3); \
312 ALPHA = alpha >> 3; \
313 for(; i > 0; --i) { \
314 Uint32 s = *srcp++; \
315 Uint32 d = *dstp; \
316 s = (s | s << 16) & 0x03e07c1f; \
317 d = (d | d << 16) & 0x03e07c1f; \
318 d += (s - d) * ALPHA >> 5; \
319 d &= 0x03e07c1f; \
320 *dstp++ = d | d >> 16; \
321 n++; \
322 } \
323 i = (int)(length) - n; \
324 for(; i > 0; --i) { \
325 movq_m2r((*dstp), mm3); \
326 movq_m2r((*srcp), mm2); \
327 movq_r2r(mm2, mm5); \
328 pand_r2r(mm1 , mm5); \
329 psrlq_i2r(10, mm5); \
330 movq_r2r(mm3, mm6); \
331 pand_r2r(mm1 , mm6); \
332 psrlq_i2r(10, mm6); \
333 psubw_r2r(mm6, mm5); \
334 pmullw_r2r(mm0, mm5); \
335 psrlw_i2r(8, mm5); \
336 paddw_r2r(mm5, mm6); \
337 psllq_i2r(10, mm6); \
338 pand_r2r(mm1, mm6); \
339 movq_r2r(mm4, mm5); \
340 por_r2r(mm7, mm5); \
341 pand_r2r(mm5, mm3); \
342 por_r2r(mm6, mm3); \
343 movq_r2r(mm2, mm5); \
344 pand_r2r(mm4 , mm5); \
345 psrlq_i2r(5, mm5); \
346 movq_r2r(mm3, mm6); \
347 pand_r2r(mm4 , mm6); \
348 psrlq_i2r(5, mm6); \
349 psubw_r2r(mm6, mm5); \
350 pmullw_r2r(mm0, mm5); \
351 psrlw_i2r(8, mm5); \
352 paddw_r2r(mm5, mm6); \
353 psllq_i2r(5, mm6); \
354 pand_r2r(mm4, mm6); \
355 movq_r2r(mm1, mm5); \
356 por_r2r(mm7, mm5); \
357 pand_r2r(mm5, mm3); \
358 por_r2r(mm6, mm3); \
359 movq_r2r(mm2, mm5); \
360 pand_r2r(mm7 , mm5); \
361 movq_r2r(mm3, mm6); \
362 pand_r2r(mm7 , mm6); \
363 psubw_r2r(mm6, mm5); \
364 pmullw_r2r(mm0, mm5); \
365 psrlw_i2r(8, mm5); \
366 paddw_r2r(mm5, mm6); \
367 pand_r2r(mm7, mm6); \
368 movq_r2r(mm1, mm5); \
369 por_r2r(mm4, mm5); \
370 pand_r2r(mm5, mm3); \
371 por_r2r(mm6, mm3); \
372 movq_r2m(mm3, *dstp); \
373 srcp += 4; \
374 dstp += 4; \
375 i -= 3; \
376 } \
377 emms(); \
378 } while(0)
379
380 #endif
381 116
382 117 /*
383 118 * For 32bpp pixels of the form 0x00rrggbb:
384 119 * If we treat the middle component separately, we can process the two
385 120 * remaining in parallel. This is safe to do because of the gap to the left
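
The macro this comment introduces (ALPHA_BLIT32_888, elided from this hunk) applies that idea with plain integer code; roughly, assuming the elided body follows the comment literally:

    /* Sketch of the two-channels-in-parallel blend described above.
     * R and B share one multiply behind the 0x00ff00ff mask; G gets
     * its own, the zero gaps absorbing the intermediate bits. */
    static Uint32 blend32_888_parallel_sketch(Uint32 s, Uint32 d, Uint32 alpha)
    {
        Uint32 s1 = s & 0x00ff00ff;    /* 0x00rr00bb */
        Uint32 d1 = d & 0x00ff00ff;
        d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0x00ff00ff;
        s &= 0x0000ff00;               /* 0x0000gg00 */
        d &= 0x0000ff00;
        d = (d + ((s - d) * alpha >> 8)) & 0x0000ff00;
        return d1 | d;                 /* recombine; alpha byte stays 0 */
    }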
502 237 src += bpp; \
503 238 dst += bpp; \
504 239 } \
505 240 } while(0)
506 241
507 #ifdef MMX_ASMBLIT
508
509 #define ALPHA_BLIT32_888_50MMX(to, from, length, bpp, alpha) \
510 do { \
511 Uint32 *srcp = (Uint32 *)(from); \
512 Uint32 *dstp = (Uint32 *)(to); \
513 int i = 0x00fefefe; \
514 movd_m2r(*(&i), mm4); \
515 punpckldq_r2r(mm4, mm4); \
516 i = 0x00010101; \
517 movd_m2r(*(&i), mm3); \
518 punpckldq_r2r(mm3, mm3); \
519 i = (int)(length); \
520 if( i & 1 ) { \
521 Uint32 s = *srcp++; \
522 Uint32 d = *dstp; \
523 *dstp++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) \
524 + (s & d & 0x00010101); \
525 i--; \
526 } \
527 for(; i > 0; --i) { \
528 movq_m2r((*dstp), mm2); /* dst -> mm2 */ \
529 movq_r2r(mm2, mm6); /* dst -> mm6 */ \
530 movq_m2r((*srcp), mm1); /* src -> mm1 */ \
531 movq_r2r(mm1, mm5); /* src -> mm5 */ \
532 pand_r2r(mm4, mm6); /* dst & 0x00fefefe -> mm6 */ \
533 pand_r2r(mm4, mm5); /* src & 0x00fefefe -> mm5 */ \
534 paddd_r2r(mm6, mm5); /* (src & 0x00fefefe) + (dst & 0x00fefefe) -> mm5 */ \
535 psrld_i2r(1, mm5); \
536 pand_r2r(mm1, mm2); /* s & d -> mm2 */ \
537 pand_r2r(mm3, mm2); /* s & d & 0x00010101 -> mm2 */ \
538 paddd_r2r(mm5, mm2); \
539 movq_r2m(mm2, (*dstp)); \
540 dstp += 2; \
541 srcp += 2; \
542 i--; \
543 } \
544 emms(); \
545 } while(0)
546
547 #endif
548
549 242 /*
550 243 * Special case: 50% alpha (alpha=128)
551 244 * This is treated specially because it can be optimized very well, and
552 245 * because it is useful for many cases of semi-translucency.
553 246 * The theory is to do all three components at the same time:
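
The remainder of this comment and the portable ALPHA_BLIT32_888_50 macro are elided from this hunk, but the trick is already visible in the odd-pixel head of the removed MMX version above:

    /* 50% blend of two 0x00rrggbb pixels in one pass: clear each
     * channel's low bit so the shift cannot leak between channels,
     * then add back a rounding carry for bits set in both pixels. */
    static Uint32 blend32_888_50_sketch(Uint32 s, Uint32 d)
    {
        return (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
               + (s & d & 0x00010101);
    }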
614 307 #define ALPHA_BLIT16_565_50(to, from, length, bpp, alpha) \
615 308 ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xf7de)
616 309
617 310 #define ALPHA_BLIT16_555_50(to, from, length, bpp, alpha) \
618 311 ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xfbde)
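
ALPHA_BLIT16_50 itself is elided from this hunk. Its mask arguments are the pixel format with the low bit of every color component cleared (0xf7de for 565, 0xfbde for 555), which lets the same average-and-carry trick run on 16bpp pixels. A hedged one-pixel sketch (the real macro is also believed to process two pixels per 32-bit word):

    /* 50% blend of two 16bpp pixels; mask clears each component's
     * low bit so the halved sum cannot borrow across field borders. */
    static Uint16 blend16_50_sketch(Uint16 s, Uint16 d, Uint16 mask)
    {
        return (Uint16) ((((s & mask) + (d & mask)) >> 1)
                         + (s & d & ~mask));
    }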
619
620 #ifdef MMX_ASMBLIT
621
622 #define CHOOSE_BLIT(blitter, alpha, fmt) \
623 do { \
624 if(alpha == 255) { \
625 switch(fmt->BytesPerPixel) { \
626 case 1: blitter(1, Uint8, OPAQUE_BLIT); break; \
627 case 2: blitter(2, Uint8, OPAQUE_BLIT); break; \
628 case 3: blitter(3, Uint8, OPAQUE_BLIT); break; \
629 case 4: blitter(4, Uint16, OPAQUE_BLIT); break; \
630 } \
631 } else { \
632 switch(fmt->BytesPerPixel) { \
633 case 1: \
634 /* No 8bpp alpha blitting */ \
635 break; \
636 \
637 case 2: \
638 switch(fmt->Rmask | fmt->Gmask | fmt->Bmask) { \
639 case 0xffff: \
640 if(fmt->Gmask == 0x07e0 \
641 || fmt->Rmask == 0x07e0 \
642 || fmt->Bmask == 0x07e0) { \
643 if(alpha == 128) \
644 blitter(2, Uint8, ALPHA_BLIT16_565_50); \
645 else { \
646 if(SDL_HasMMX()) \
647 blitter(2, Uint8, ALPHA_BLIT16_565MMX); \
648 else \
649 blitter(2, Uint8, ALPHA_BLIT16_565); \
650 } \
651 } else \
652 goto general16; \
653 break; \
654 \
655 case 0x7fff: \
656 if(fmt->Gmask == 0x03e0 \
657 || fmt->Rmask == 0x03e0 \
658 || fmt->Bmask == 0x03e0) { \
659 if(alpha == 128) \
660 blitter(2, Uint8, ALPHA_BLIT16_555_50); \
661 else { \
662 if(SDL_HasMMX()) \
663 blitter(2, Uint8, ALPHA_BLIT16_555MMX); \
664 else \
665 blitter(2, Uint8, ALPHA_BLIT16_555); \
666 } \
667 break; \
668 } \
669 /* fallthrough */ \
670 \
671 default: \
672 general16: \
673 blitter(2, Uint8, ALPHA_BLIT_ANY); \
674 } \
675 break; \
676 \
677 case 3: \
678 blitter(3, Uint8, ALPHA_BLIT_ANY); \
679 break; \
680 \
681 case 4: \
682 if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff \
683 && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00 \
684 || fmt->Bmask == 0xff00)) { \
685 if(alpha == 128) \
686 { \
687 if(SDL_HasMMX()) \
688 blitter(4, Uint16, ALPHA_BLIT32_888_50MMX);\
689 else \
690 blitter(4, Uint16, ALPHA_BLIT32_888_50);\
691 } \
692 else \
693 { \
694 if(SDL_HasMMX()) \
695 blitter(4, Uint16, ALPHA_BLIT32_888MMX);\
696 else \
697 blitter(4, Uint16, ALPHA_BLIT32_888); \
698 } \
699 } else \
700 blitter(4, Uint16, ALPHA_BLIT_ANY); \
701 break; \
702 } \
703 } \
704 } while(0)
705
706 #else
707 312
708 313 #define CHOOSE_BLIT(blitter, alpha, fmt) \
709 314 do { \
710 315 if(alpha == 255) { \
711 316 switch(fmt->BytesPerPixel) { \
770 375 blitter(4, Uint16, ALPHA_BLIT_ANY); \
771 376 break; \
772 377 } \
773 378 } \
774 379 } while(0)
775
776 #endif
777 380
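
CHOOSE_BLIT expands its blitter argument with a pixel size, a natural word type, and one of the blend macros above; those are then invoked as do_blit(to, from, length, bpp, alpha). The real callers (the RLE blitters later in this file) fall outside this hunk; a toy dispatch with hypothetical names shows the contract:

    /* Hypothetical caller: blit one row, letting CHOOSE_BLIT pick the
     * blend routine for the surface format. TOY_ROW_BLIT has the
     * (bpp, Type, do_blit) shape that CHOOSE_BLIT expects. */
    static void toy_blit_row(Uint8 * dst_row, Uint8 * src_row,
                             int row_pixels, unsigned alpha,
                             SDL_PixelFormat * fmt)
    {
    #define TOY_ROW_BLIT(bpp, Type, do_blit) \
        do_blit(dst_row, src_row, row_pixels, bpp, alpha)
        CHOOSE_BLIT(TOY_ROW_BLIT, alpha, fmt);
    #undef TOY_ROW_BLIT
    }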
778 381 /*
779 382 * This takes care of the case when the surface is clipped on the left and/or
780 383 * right. Top clipping has already been taken care of.
781 384 */