Mercurial > sdl-ios-xcode
comparison src/video/SDL_RLEaccel.c @ 3035:ff602fdfdedc
Removed Rafal Bursig's MMX RLE code, at his request.
author | Sam Lantinga <slouken@libsdl.org> |
---|---|
date | Tue, 13 Jan 2009 07:20:55 +0000 |
parents | 99210400e8b9 |
children | dc1eb82ffdaa |
comparison
equal
deleted
inserted
replaced
3034:0e821769fc51 | 3035:ff602fdfdedc |
---|---|
89 #include "SDL_video.h" | 89 #include "SDL_video.h" |
90 #include "SDL_sysvideo.h" | 90 #include "SDL_sysvideo.h" |
91 #include "SDL_blit.h" | 91 #include "SDL_blit.h" |
92 #include "SDL_RLEaccel_c.h" | 92 #include "SDL_RLEaccel_c.h" |
93 | 93 |
94 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES | |
95 #define MMX_ASMBLIT | |
96 #endif | |
97 | |
98 #ifdef MMX_ASMBLIT | |
99 #include "mmx.h" | |
100 #include "SDL_cpuinfo.h" | |
101 #endif | |
102 | |
103 #ifndef MAX | 94 #ifndef MAX |
104 #define MAX(a, b) ((a) > (b) ? (a) : (b)) | 95 #define MAX(a, b) ((a) > (b) ? (a) : (b)) |
105 #endif | 96 #endif |
106 #ifndef MIN | 97 #ifndef MIN |
107 #define MIN(a, b) ((a) < (b) ? (a) : (b)) | 98 #define MIN(a, b) ((a) < (b) ? (a) : (b)) |
120 * Various colorkey blit methods, for opaque and per-surface alpha | 111 * Various colorkey blit methods, for opaque and per-surface alpha |
121 */ | 112 */ |
122 | 113 |
123 #define OPAQUE_BLIT(to, from, length, bpp, alpha) \ | 114 #define OPAQUE_BLIT(to, from, length, bpp, alpha) \ |
124 PIXEL_COPY(to, from, length, bpp) | 115 PIXEL_COPY(to, from, length, bpp) |
125 | |
126 #ifdef MMX_ASMBLIT | |
127 | |
128 #define ALPHA_BLIT32_888MMX(to, from, length, bpp, alpha) \ | |
129 do { \ | |
130 Uint32 *srcp = (Uint32 *)(from); \ | |
131 Uint32 *dstp = (Uint32 *)(to); \ | |
132 int i = 0x00FF00FF; \ | |
133 movd_m2r(*(&i), mm3); \ | |
134 punpckldq_r2r(mm3, mm3); \ | |
135 i = 0xFF000000; \ | |
136 movd_m2r(*(&i), mm7); \ | |
137 punpckldq_r2r(mm7, mm7); \ | |
138 i = alpha | alpha << 16; \ | |
139 movd_m2r(*(&i), mm4); \ | |
140 punpckldq_r2r(mm4, mm4); \ | |
141 pcmpeqd_r2r(mm5,mm5); /* set all bits of mm5 to 1 */ \ | |
142 pxor_r2r(mm7, mm5); /* make clear alpha mask */ \ | |
143 i = length; \ | |
144 if(i & 1) { \ | |
145 movd_m2r((*srcp), mm1); /* src -> mm1 */ \ | |
146 punpcklbw_r2r(mm1, mm1); \ | |
147 pand_r2r(mm3, mm1); \ | |
148 movd_m2r((*dstp), mm2); /* dst -> mm2 */ \ | |
149 punpcklbw_r2r(mm2, mm2); \ | |
150 pand_r2r(mm3, mm2); \ | |
151 psubw_r2r(mm2, mm1); \ | |
152 pmullw_r2r(mm4, mm1); \ | |
153 psrlw_i2r(8, mm1); \ | |
154 paddw_r2r(mm1, mm2); \ | |
155 pand_r2r(mm3, mm2); \ | |
156 packuswb_r2r(mm2, mm2); \ | |
157 pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */ \ | |
158 movd_r2m(mm2, *dstp); \ | |
159 ++srcp; \ | |
160 ++dstp; \ | |
161 i--; \ | |
162 } \ | |
163 for(; i > 0; --i) { \ | |
164 movq_m2r((*srcp), mm0); \ | |
165 movq_r2r(mm0, mm1); \ | |
166 punpcklbw_r2r(mm0, mm0); \ | |
167 movq_m2r((*dstp), mm2); \ | |
168 punpckhbw_r2r(mm1, mm1); \ | |
169 movq_r2r(mm2, mm6); \ | |
170 pand_r2r(mm3, mm0); \ | |
171 punpcklbw_r2r(mm2, mm2); \ | |
172 pand_r2r(mm3, mm1); \ | |
173 punpckhbw_r2r(mm6, mm6); \ | |
174 pand_r2r(mm3, mm2); \ | |
175 psubw_r2r(mm2, mm0); \ | |
176 pmullw_r2r(mm4, mm0); \ | |
177 pand_r2r(mm3, mm6); \ | |
178 psubw_r2r(mm6, mm1); \ | |
179 pmullw_r2r(mm4, mm1); \ | |
180 psrlw_i2r(8, mm0); \ | |
181 paddw_r2r(mm0, mm2); \ | |
182 psrlw_i2r(8, mm1); \ | |
183 paddw_r2r(mm1, mm6); \ | |
184 pand_r2r(mm3, mm2); \ | |
185 pand_r2r(mm3, mm6); \ | |
186 packuswb_r2r(mm2, mm2); \ | |
187 packuswb_r2r(mm6, mm6); \ | |
188 psrlq_i2r(32, mm2); \ | |
189 psllq_i2r(32, mm6); \ | |
190 por_r2r(mm6, mm2); \ | |
191 pand_r2r(mm5, mm2); /* 00000RGB -> mm2 */ \ | |
192 movq_r2m(mm2, *dstp); \ | |
193 srcp += 2; \ | |
194 dstp += 2; \ | |
195 i--; \ | |
196 } \ | |
197 emms(); \ | |
198 } while(0) | |
199 | |
200 #define ALPHA_BLIT16_565MMX(to, from, length, bpp, alpha) \ | |
201 do { \ | |
202 int i, n = 0; \ | |
203 Uint16 *srcp = (Uint16 *)(from); \ | |
204 Uint16 *dstp = (Uint16 *)(to); \ | |
205 Uint32 ALPHA = 0xF800; \ | |
206 movd_m2r(*(&ALPHA), mm1); \ | |
207 punpcklwd_r2r(mm1, mm1); \ | |
208 punpcklwd_r2r(mm1, mm1); \ | |
209 ALPHA = 0x07E0; \ | |
210 movd_m2r(*(&ALPHA), mm4); \ | |
211 punpcklwd_r2r(mm4, mm4); \ | |
212 punpcklwd_r2r(mm4, mm4); \ | |
213 ALPHA = 0x001F; \ | |
214 movd_m2r(*(&ALPHA), mm7); \ | |
215 punpcklwd_r2r(mm7, mm7); \ | |
216 punpcklwd_r2r(mm7, mm7); \ | |
217 alpha &= ~(1+2+4); \ | |
218 i = (Uint32)alpha | (Uint32)alpha << 16; \ | |
219 movd_m2r(*(&i), mm0); \ | |
220 punpckldq_r2r(mm0, mm0); \ | |
221 ALPHA = alpha >> 3; \ | |
222 i = ((int)(length) & 3); \ | |
223 for(; i > 0; --i) { \ | |
224 Uint32 s = *srcp++; \ | |
225 Uint32 d = *dstp; \ | |
226 s = (s | s << 16) & 0x07e0f81f; \ | |
227 d = (d | d << 16) & 0x07e0f81f; \ | |
228 d += (s - d) * ALPHA >> 5; \ | |
229 d &= 0x07e0f81f; \ | |
230 *dstp++ = d | d >> 16; \ | |
231 n++; \ | |
232 } \ | |
233 i = (int)(length) - n; \ | |
234 for(; i > 0; --i) { \ | |
235 movq_m2r((*dstp), mm3); \ | |
236 movq_m2r((*srcp), mm2); \ | |
237 movq_r2r(mm2, mm5); \ | |
238 pand_r2r(mm1 , mm5); \ | |
239 psrlq_i2r(11, mm5); \ | |
240 movq_r2r(mm3, mm6); \ | |
241 pand_r2r(mm1 , mm6); \ | |
242 psrlq_i2r(11, mm6); \ | |
243 psubw_r2r(mm6, mm5); \ | |
244 pmullw_r2r(mm0, mm5); \ | |
245 psrlw_i2r(8, mm5); \ | |
246 paddw_r2r(mm5, mm6); \ | |
247 psllq_i2r(11, mm6); \ | |
248 pand_r2r(mm1, mm6); \ | |
249 movq_r2r(mm4, mm5); \ | |
250 por_r2r(mm7, mm5); \ | |
251 pand_r2r(mm5, mm3); \ | |
252 por_r2r(mm6, mm3); \ | |
253 movq_r2r(mm2, mm5); \ | |
254 pand_r2r(mm4 , mm5); \ | |
255 psrlq_i2r(5, mm5); \ | |
256 movq_r2r(mm3, mm6); \ | |
257 pand_r2r(mm4 , mm6); \ | |
258 psrlq_i2r(5, mm6); \ | |
259 psubw_r2r(mm6, mm5); \ | |
260 pmullw_r2r(mm0, mm5); \ | |
261 psrlw_i2r(8, mm5); \ | |
262 paddw_r2r(mm5, mm6); \ | |
263 psllq_i2r(5, mm6); \ | |
264 pand_r2r(mm4, mm6); \ | |
265 movq_r2r(mm1, mm5); \ | |
266 por_r2r(mm7, mm5); \ | |
267 pand_r2r(mm5, mm3); \ | |
268 por_r2r(mm6, mm3); \ | |
269 movq_r2r(mm2, mm5); \ | |
270 pand_r2r(mm7 , mm5); \ | |
271 movq_r2r(mm3, mm6); \ | |
272 pand_r2r(mm7 , mm6); \ | |
273 psubw_r2r(mm6, mm5); \ | |
274 pmullw_r2r(mm0, mm5); \ | |
275 psrlw_i2r(8, mm5); \ | |
276 paddw_r2r(mm5, mm6); \ | |
277 pand_r2r(mm7, mm6); \ | |
278 movq_r2r(mm1, mm5); \ | |
279 por_r2r(mm4, mm5); \ | |
280 pand_r2r(mm5, mm3); \ | |
281 por_r2r(mm6, mm3); \ | |
282 movq_r2m(mm3, *dstp); \ | |
283 srcp += 4; \ | |
284 dstp += 4; \ | |
285 i -= 3; \ | |
286 } \ | |
287 emms(); \ | |
288 } while(0) | |
289 | |
290 #define ALPHA_BLIT16_555MMX(to, from, length, bpp, alpha) \ | |
291 do { \ | |
292 int i, n = 0; \ | |
293 Uint16 *srcp = (Uint16 *)(from); \ | |
294 Uint16 *dstp = (Uint16 *)(to); \ | |
295 Uint32 ALPHA = 0x7C00; \ | |
296 movd_m2r(*(&ALPHA), mm1); \ | |
297 punpcklwd_r2r(mm1, mm1); \ | |
298 punpcklwd_r2r(mm1, mm1); \ | |
299 ALPHA = 0x03E0; \ | |
300 movd_m2r(*(&ALPHA), mm4); \ | |
301 punpcklwd_r2r(mm4, mm4); \ | |
302 punpcklwd_r2r(mm4, mm4); \ | |
303 ALPHA = 0x001F; \ | |
304 movd_m2r(*(&ALPHA), mm7); \ | |
305 punpcklwd_r2r(mm7, mm7); \ | |
306 punpcklwd_r2r(mm7, mm7); \ | |
307 alpha &= ~(1+2+4); \ | |
308 i = (Uint32)alpha | (Uint32)alpha << 16; \ | |
309 movd_m2r(*(&i), mm0); \ | |
310 punpckldq_r2r(mm0, mm0); \ | |
311 i = ((int)(length) & 3); \ | |
312 ALPHA = alpha >> 3; \ | |
313 for(; i > 0; --i) { \ | |
314 Uint32 s = *srcp++; \ | |
315 Uint32 d = *dstp; \ | |
316 s = (s | s << 16) & 0x03e07c1f; \ | |
317 d = (d | d << 16) & 0x03e07c1f; \ | |
318 d += (s - d) * ALPHA >> 5; \ | |
319 d &= 0x03e07c1f; \ | |
320 *dstp++ = d | d >> 16; \ | |
321 n++; \ | |
322 } \ | |
323 i = (int)(length) - n; \ | |
324 for(; i > 0; --i) { \ | |
325 movq_m2r((*dstp), mm3); \ | |
326 movq_m2r((*srcp), mm2); \ | |
327 movq_r2r(mm2, mm5); \ | |
328 pand_r2r(mm1 , mm5); \ | |
329 psrlq_i2r(10, mm5); \ | |
330 movq_r2r(mm3, mm6); \ | |
331 pand_r2r(mm1 , mm6); \ | |
332 psrlq_i2r(10, mm6); \ | |
333 psubw_r2r(mm6, mm5); \ | |
334 pmullw_r2r(mm0, mm5); \ | |
335 psrlw_i2r(8, mm5); \ | |
336 paddw_r2r(mm5, mm6); \ | |
337 psllq_i2r(10, mm6); \ | |
338 pand_r2r(mm1, mm6); \ | |
339 movq_r2r(mm4, mm5); \ | |
340 por_r2r(mm7, mm5); \ | |
341 pand_r2r(mm5, mm3); \ | |
342 por_r2r(mm6, mm3); \ | |
343 movq_r2r(mm2, mm5); \ | |
344 pand_r2r(mm4 , mm5); \ | |
345 psrlq_i2r(5, mm5); \ | |
346 movq_r2r(mm3, mm6); \ | |
347 pand_r2r(mm4 , mm6); \ | |
348 psrlq_i2r(5, mm6); \ | |
349 psubw_r2r(mm6, mm5); \ | |
350 pmullw_r2r(mm0, mm5); \ | |
351 psrlw_i2r(8, mm5); \ | |
352 paddw_r2r(mm5, mm6); \ | |
353 psllq_i2r(5, mm6); \ | |
354 pand_r2r(mm4, mm6); \ | |
355 movq_r2r(mm1, mm5); \ | |
356 por_r2r(mm7, mm5); \ | |
357 pand_r2r(mm5, mm3); \ | |
358 por_r2r(mm6, mm3); \ | |
359 movq_r2r(mm2, mm5); \ | |
360 pand_r2r(mm7 , mm5); \ | |
361 movq_r2r(mm3, mm6); \ | |
362 pand_r2r(mm7 , mm6); \ | |
363 psubw_r2r(mm6, mm5); \ | |
364 pmullw_r2r(mm0, mm5); \ | |
365 psrlw_i2r(8, mm5); \ | |
366 paddw_r2r(mm5, mm6); \ | |
367 pand_r2r(mm7, mm6); \ | |
368 movq_r2r(mm1, mm5); \ | |
369 por_r2r(mm4, mm5); \ | |
370 pand_r2r(mm5, mm3); \ | |
371 por_r2r(mm6, mm3); \ | |
372 movq_r2m(mm3, *dstp); \ | |
373 srcp += 4; \ | |
374 dstp += 4; \ | |
375 i -= 3; \ | |
376 } \ | |
377 emms(); \ | |
378 } while(0) | |
379 | |
380 #endif | |
381 | 116 |
382 /* | 117 /* |
383 * For 32bpp pixels of the form 0x00rrggbb: | 118 * For 32bpp pixels of the form 0x00rrggbb: |
384 * If we treat the middle component separately, we can process the two | 119 * If we treat the middle component separately, we can process the two |
385 * remaining in parallel. This is safe to do because of the gap to the left | 120 * remaining in parallel. This is safe to do because of the gap to the left |
502 src += bpp; \ | 237 src += bpp; \ |
503 dst += bpp; \ | 238 dst += bpp; \ |
504 } \ | 239 } \ |
505 } while(0) | 240 } while(0) |
506 | 241 |
507 #ifdef MMX_ASMBLIT | |
508 | |
509 #define ALPHA_BLIT32_888_50MMX(to, from, length, bpp, alpha) \ | |
510 do { \ | |
511 Uint32 *srcp = (Uint32 *)(from); \ | |
512 Uint32 *dstp = (Uint32 *)(to); \ | |
513 int i = 0x00fefefe; \ | |
514 movd_m2r(*(&i), mm4); \ | |
515 punpckldq_r2r(mm4, mm4); \ | |
516 i = 0x00010101; \ | |
517 movd_m2r(*(&i), mm3); \ | |
518 punpckldq_r2r(mm3, mm3); \ | |
519 i = (int)(length); \ | |
520 if( i & 1 ) { \ | |
521 Uint32 s = *srcp++; \ | |
522 Uint32 d = *dstp; \ | |
523 *dstp++ = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) \ | |
524 + (s & d & 0x00010101); \ | |
525 i--; \ | |
526 } \ | |
527 for(; i > 0; --i) { \ | |
528 movq_m2r((*dstp), mm2); /* dst -> mm2 */ \ | |
529 movq_r2r(mm2, mm6); /* dst -> mm6 */ \ | |
530 movq_m2r((*srcp), mm1); /* src -> mm1 */ \ | |
531 movq_r2r(mm1, mm5); /* src -> mm5 */ \ | |
532 pand_r2r(mm4, mm6); /* dst & 0x00fefefe -> mm6 */ \ | |
533 pand_r2r(mm4, mm5); /* src & 0x00fefefe -> mm5 */ \ | |
534 paddd_r2r(mm6, mm5); /* (src & 0x00fefefe) + (dst & 0x00fefefe) -> mm5 */ \ | |
535 psrld_i2r(1, mm5); \ | |
536 pand_r2r(mm1, mm2); /* s & d -> mm2 */ \ | |
537 pand_r2r(mm3, mm2); /* s & d & 0x00010101 -> mm2 */ \ | |
538 paddd_r2r(mm5, mm2); \ | |
539 movq_r2m(mm2, (*dstp)); \ | |
540 dstp += 2; \ | |
541 srcp += 2; \ | |
542 i--; \ | |
543 } \ | |
544 emms(); \ | |
545 } while(0) | |
546 | |
547 #endif | |
548 | |
549 /* | 242 /* |
550 * Special case: 50% alpha (alpha=128) | 243 * Special case: 50% alpha (alpha=128) |
551 * This is treated specially because it can be optimized very well, and | 244 * This is treated specially because it can be optimized very well, and |
552 * since it is good for many cases of semi-translucency. | 245 * since it is good for many cases of semi-translucency. |
553 * The theory is to do all three components at the same time: | 246 * The theory is to do all three components at the same time: |
614 #define ALPHA_BLIT16_565_50(to, from, length, bpp, alpha) \ | 307 #define ALPHA_BLIT16_565_50(to, from, length, bpp, alpha) \ |
615 ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xf7de) | 308 ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xf7de) |
616 | 309 |
617 #define ALPHA_BLIT16_555_50(to, from, length, bpp, alpha) \ | 310 #define ALPHA_BLIT16_555_50(to, from, length, bpp, alpha) \ |
618 ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xfbde) | 311 ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xfbde) |
619 | |
620 #ifdef MMX_ASMBLIT | |
621 | |
622 #define CHOOSE_BLIT(blitter, alpha, fmt) \ | |
623 do { \ | |
624 if(alpha == 255) { \ | |
625 switch(fmt->BytesPerPixel) { \ | |
626 case 1: blitter(1, Uint8, OPAQUE_BLIT); break; \ | |
627 case 2: blitter(2, Uint8, OPAQUE_BLIT); break; \ | |
628 case 3: blitter(3, Uint8, OPAQUE_BLIT); break; \ | |
629 case 4: blitter(4, Uint16, OPAQUE_BLIT); break; \ | |
630 } \ | |
631 } else { \ | |
632 switch(fmt->BytesPerPixel) { \ | |
633 case 1: \ | |
634 /* No 8bpp alpha blitting */ \ | |
635 break; \ | |
636 \ | |
637 case 2: \ | |
638 switch(fmt->Rmask | fmt->Gmask | fmt->Bmask) { \ | |
639 case 0xffff: \ | |
640 if(fmt->Gmask == 0x07e0 \ | |
641 || fmt->Rmask == 0x07e0 \ | |
642 || fmt->Bmask == 0x07e0) { \ | |
643 if(alpha == 128) \ | |
644 blitter(2, Uint8, ALPHA_BLIT16_565_50); \ | |
645 else { \ | |
646 if(SDL_HasMMX()) \ | |
647 blitter(2, Uint8, ALPHA_BLIT16_565MMX); \ | |
648 else \ | |
649 blitter(2, Uint8, ALPHA_BLIT16_565); \ | |
650 } \ | |
651 } else \ | |
652 goto general16; \ | |
653 break; \ | |
654 \ | |
655 case 0x7fff: \ | |
656 if(fmt->Gmask == 0x03e0 \ | |
657 || fmt->Rmask == 0x03e0 \ | |
658 || fmt->Bmask == 0x03e0) { \ | |
659 if(alpha == 128) \ | |
660 blitter(2, Uint8, ALPHA_BLIT16_555_50); \ | |
661 else { \ | |
662 if(SDL_HasMMX()) \ | |
663 blitter(2, Uint8, ALPHA_BLIT16_555MMX); \ | |
664 else \ | |
665 blitter(2, Uint8, ALPHA_BLIT16_555); \ | |
666 } \ | |
667 break; \ | |
668 } \ | |
669 /* fallthrough */ \ | |
670 \ | |
671 default: \ | |
672 general16: \ | |
673 blitter(2, Uint8, ALPHA_BLIT_ANY); \ | |
674 } \ | |
675 break; \ | |
676 \ | |
677 case 3: \ | |
678 blitter(3, Uint8, ALPHA_BLIT_ANY); \ | |
679 break; \ | |
680 \ | |
681 case 4: \ | |
682 if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff \ | |
683 && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00 \ | |
684 || fmt->Bmask == 0xff00)) { \ | |
685 if(alpha == 128) \ | |
686 { \ | |
687 if(SDL_HasMMX()) \ | |
688 blitter(4, Uint16, ALPHA_BLIT32_888_50MMX);\ | |
689 else \ | |
690 blitter(4, Uint16, ALPHA_BLIT32_888_50);\ | |
691 } \ | |
692 else \ | |
693 { \ | |
694 if(SDL_HasMMX()) \ | |
695 blitter(4, Uint16, ALPHA_BLIT32_888MMX);\ | |
696 else \ | |
697 blitter(4, Uint16, ALPHA_BLIT32_888); \ | |
698 } \ | |
699 } else \ | |
700 blitter(4, Uint16, ALPHA_BLIT_ANY); \ | |
701 break; \ | |
702 } \ | |
703 } \ | |
704 } while(0) | |
705 | |
706 #else | |
707 | 312 |
708 #define CHOOSE_BLIT(blitter, alpha, fmt) \ | 313 #define CHOOSE_BLIT(blitter, alpha, fmt) \ |
709 do { \ | 314 do { \ |
710 if(alpha == 255) { \ | 315 if(alpha == 255) { \ |
711 switch(fmt->BytesPerPixel) { \ | 316 switch(fmt->BytesPerPixel) { \ |
770 blitter(4, Uint16, ALPHA_BLIT_ANY); \ | 375 blitter(4, Uint16, ALPHA_BLIT_ANY); \ |
771 break; \ | 376 break; \ |
772 } \ | 377 } \ |
773 } \ | 378 } \ |
774 } while(0) | 379 } while(0) |
775 | |
776 #endif | |
777 | 380 |
778 /* | 381 /* |
779 * This takes care of the case when the surface is clipped on the left and/or | 382 * This takes care of the case when the surface is clipped on the left and/or |
780 * right. Top clipping has already been taken care of. | 383 * right. Top clipping has already been taken care of. |
781 */ | 384 */ |