Mercurial > sdl-ios-xcode
comparison src/hermes/mmxp2_32.asm @ 1230:88c2d6aed428
From Mike Frysinger and/or Gentoo:
- libsdl-PIC-load-mmx-masks-from-stack.patch
this one may be a little controversial ... the fix here is again that you can't
reference the memory addresses like this to load into an MMX register, so the
way to do it is to push two 32-bit words onto the stack, load the 64-bit value
off of the stack into the MMX register, and then adjust the stack so that
it's back to normal.
author | Ryan C. Gordon <icculus@icculus.org> |
---|---|
date | Thu, 05 Jan 2006 15:25:19 +0000 |
parents | 2d6dc7de1145 |
children | 393092a3ebf6 |
comparison
equal
deleted
inserted
replaced
1229:1430f5fe092a | 1230:88c2d6aed428 |
---|---|
27 GLOBAL _ConvertMMXpII32_16RGB555 | 27 GLOBAL _ConvertMMXpII32_16RGB555 |
28 GLOBAL _ConvertMMXpII32_16BGR555 | 28 GLOBAL _ConvertMMXpII32_16BGR555 |
29 | 29 |
30 EXTERN _mmxreturn | 30 EXTERN _mmxreturn |
31 | 31 |
32 SECTION .data | 32 ;; Macros for conversion routines |
33 | 33 |
34 ALIGN 8 | 34 %macro _push_immq_mask 1 |
35 | 35 push dword %1 |
36 ;; Constants for conversion routines | 36 push dword %1 |
37 | 37 %endmacro |
38 mmx32_rgb888_mask dd 00ffffffh,00ffffffh | 38 |
39 | 39 %macro load_immq 2 |
40 mmx32_rgb565_b dd 000000f8h, 000000f8h | 40 _push_immq_mask %2 |
41 mmx32_rgb565_g dd 0000fc00h, 0000fc00h | 41 movq %1, [esp] |
42 mmx32_rgb565_r dd 00f80000h, 00f80000h | 42 %endmacro |
43 | 43 |
44 mmx32_rgb555_rb dd 00f800f8h,00f800f8h | 44 %macro pand_immq 2 |
45 mmx32_rgb555_g dd 0000f800h,0000f800h | 45 _push_immq_mask %2 |
46 mmx32_rgb555_mul dd 20000008h,20000008h | 46 pand %1, [esp] |
47 mmx32_bgr555_mul dd 00082000h,00082000h | 47 %endmacro |
48 | 48 |
49 | 49 %define CLEANUP_IMMQ_LOADS(num) \ |
50 | 50 add esp, byte 8 * num |
51 | |
52 %define mmx32_rgb888_mask 00ffffffh | |
53 %define mmx32_rgb565_b 000000f8h | |
54 %define mmx32_rgb565_g 0000fc00h | |
55 %define mmx32_rgb565_r 00f80000h | |
56 | |
57 %define mmx32_rgb555_rb 00f800f8h | |
58 %define mmx32_rgb555_g 0000f800h | |
59 %define mmx32_rgb555_mul 20000008h | |
60 %define mmx32_bgr555_mul 00082000h | |
61 | |
51 SECTION .text | 62 SECTION .text |
52 | 63 |
53 _ConvertMMXpII32_24RGB888: | 64 _ConvertMMXpII32_24RGB888: |
54 | 65 |
55 ; set up mm6 as the mask, mm7 as zero | 66 ; set up mm6 as the mask, mm7 as zero |
56 movq mm6, qword [mmx32_rgb888_mask] | 67 load_immq mm6, mmx32_rgb888_mask |
68 CLEANUP_IMMQ_LOADS(1) | |
57 pxor mm7, mm7 | 69 pxor mm7, mm7 |
58 | 70 |
59 mov edx, ecx ; save ecx | 71 mov edx, ecx ; save ecx |
60 and ecx, 0fffffffch ; clear lower two bits | 72 and ecx, 0fffffffch ; clear lower two bits |
61 jnz .L1 | 73 jnz .L1 |
113 | 125 |
114 | 126 |
115 _ConvertMMXpII32_16RGB565: | 127 _ConvertMMXpII32_16RGB565: |
116 | 128 |
117 ; set up masks | 129 ; set up masks |
118 movq mm5, [mmx32_rgb565_b] | 130 load_immq mm5, mmx32_rgb565_b |
119 movq mm6, [mmx32_rgb565_g] | 131 load_immq mm6, mmx32_rgb565_g |
120 movq mm7, [mmx32_rgb565_r] | 132 load_immq mm7, mmx32_rgb565_r |
133 CLEANUP_IMMQ_LOADS(3) | |
121 | 134 |
122 mov edx, ecx | 135 mov edx, ecx |
123 shr ecx, 2 | 136 shr ecx, 2 |
124 jnz .L1 | 137 jnz .L1 |
125 jmp .L2 ; not necessary at the moment, but doesn't hurt (much) | 138 jmp .L2 ; not necessary at the moment, but doesn't hurt (much) |
179 jmp _mmxreturn | 192 jmp _mmxreturn |
180 | 193 |
181 | 194 |
182 _ConvertMMXpII32_16BGR565: | 195 _ConvertMMXpII32_16BGR565: |
183 | 196 |
184 movq mm5, [mmx32_rgb565_r] | 197 load_immq mm5, mmx32_rgb565_r |
185 movq mm6, [mmx32_rgb565_g] | 198 load_immq mm6, mmx32_rgb565_g |
186 movq mm7, [mmx32_rgb565_b] | 199 load_immq mm7, mmx32_rgb565_b |
200 CLEANUP_IMMQ_LOADS(3) | |
187 | 201 |
188 mov edx, ecx | 202 mov edx, ecx |
189 shr ecx, 2 | 203 shr ecx, 2 |
190 jnz .L1 | 204 jnz .L1 |
191 jmp .L2 | 205 jmp .L2 |
251 | 265 |
252 ; the 16BGR555 converter is identical to the RGB555 one, | 266 ; the 16BGR555 converter is identical to the RGB555 one, |
253 ; except it uses a different multiplier for the pmaddwd | 267 ; except it uses a different multiplier for the pmaddwd |
254 ; instruction. cool huh. | 268 ; instruction. cool huh. |
255 | 269 |
256 movq mm7, qword [mmx32_bgr555_mul] | 270 load_immq mm7, mmx32_bgr555_mul |
257 jmp _convert_bgr555_cheat | 271 jmp _convert_bgr555_cheat |
258 | 272 |
259 ; This is the same as the Intel version.. they obviously went to | 273 ; This is the same as the Intel version.. they obviously went to |
260 ; much more trouble to expand/coil the loop than I did, so theirs | 274 ; much more trouble to expand/coil the loop than I did, so theirs |
261 ; would almost certainly be faster, even if only a little. | 275 ; would almost certainly be faster, even if only a little. |
262 ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is | 276 ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is |
263 ; (I think) a more accurate name.. | 277 ; (I think) a more accurate name.. |
264 _ConvertMMXpII32_16RGB555: | 278 _ConvertMMXpII32_16RGB555: |
265 | 279 |
266 movq mm7,qword [mmx32_rgb555_mul] | 280 load_immq mm7, mmx32_rgb555_mul |
267 _convert_bgr555_cheat: | 281 _convert_bgr555_cheat: |
268 movq mm6,qword [mmx32_rgb555_g] | 282 load_immq mm6, mmx32_rgb555_g |
283 CLEANUP_IMMQ_LOADS(2) | |
269 | 284 |
270 mov edx,ecx ; Save ecx | 285 mov edx,ecx ; Save ecx |
271 | 286 |
272 and ecx,BYTE 0fffffff8h ; clear lower three bits | 287 and ecx,BYTE 0fffffff8h ; clear lower three bits |
273 jnz .L_OK | 288 jnz .L_OK |
278 movq mm2,[esi+8] | 293 movq mm2,[esi+8] |
279 | 294 |
280 movq mm0,[esi] | 295 movq mm0,[esi] |
281 movq mm3,mm2 | 296 movq mm3,mm2 |
282 | 297 |
283 pand mm3,qword [mmx32_rgb555_rb] | 298 pand_immq mm3, mmx32_rgb555_rb |
284 movq mm1,mm0 | 299 movq mm1,mm0 |
285 | 300 |
286 pand mm1,qword [mmx32_rgb555_rb] | 301 pand_immq mm1, mmx32_rgb555_rb |
287 pmaddwd mm3,mm7 | 302 pmaddwd mm3,mm7 |
303 | |
304 CLEANUP_IMMQ_LOADS(2) | |
288 | 305 |
289 pmaddwd mm1,mm7 | 306 pmaddwd mm1,mm7 |
290 pand mm2,mm6 | 307 pand mm2,mm6 |
291 | 308 |
292 .L1: | 309 .L1: |
300 por mm1,mm0 | 317 por mm1,mm0 |
301 | 318 |
302 movq mm0,mm4 | 319 movq mm0,mm4 |
303 psrld mm1,6 | 320 psrld mm1,6 |
304 | 321 |
305 pand mm0,qword [mmx32_rgb555_rb] | 322 pand_immq mm0, mmx32_rgb555_rb |
306 packssdw mm1,mm3 | 323 packssdw mm1,mm3 |
307 | 324 |
308 movq mm3,mm5 | 325 movq mm3,mm5 |
309 pmaddwd mm0,mm7 | 326 pmaddwd mm0,mm7 |
310 | 327 |
311 pand mm3,qword [mmx32_rgb555_rb] | 328 pand_immq mm3, mmx32_rgb555_rb |
312 pand mm4,mm6 | 329 pand mm4,mm6 |
313 | 330 |
314 movq [edi],mm1 | 331 movq [edi],mm1 |
315 pmaddwd mm3,mm7 | 332 pmaddwd mm3,mm7 |
316 | 333 |
327 psrld mm5,6 | 344 psrld mm5,6 |
328 | 345 |
329 movq mm3,mm2 | 346 movq mm3,mm2 |
330 movq mm1,mm0 | 347 movq mm1,mm0 |
331 | 348 |
332 pand mm3,qword [mmx32_rgb555_rb] | 349 pand_immq mm3, mmx32_rgb555_rb |
333 packssdw mm5,mm4 | 350 packssdw mm5,mm4 |
334 | 351 |
335 pand mm1,qword [mmx32_rgb555_rb] | 352 pand_immq mm1, mmx32_rgb555_rb |
336 pand mm2,mm6 | 353 pand mm2,mm6 |
354 | |
355 CLEANUP_IMMQ_LOADS(4) | |
337 | 356 |
338 movq [edi+8],mm5 | 357 movq [edi+8],mm5 |
339 pmaddwd mm3,mm7 | 358 pmaddwd mm3,mm7 |
340 | 359 |
341 pmaddwd mm1,mm7 | 360 pmaddwd mm1,mm7 |