comparison src/hermes/mmxp2_32.asm @ 1230:88c2d6aed428

From Mike Frysinger and/or Gentoo: - libsdl-PIC-load-mmx-masks-from-stack.patch: this one may be a little controversial ... the fix here is again that you can't reference the memory addresses like this to load into an MMX register, so the way to do it is to push two 32-bit words onto the stack, load the 64-bit value off of the stack into the MMX register, and then adjust the stack so that it's back to normal.
author Ryan C. Gordon <icculus@icculus.org>
date Thu, 05 Jan 2006 15:25:19 +0000
parents 2d6dc7de1145
children 393092a3ebf6
comparison
equal deleted inserted replaced
1229:1430f5fe092a 1230:88c2d6aed428
27 GLOBAL _ConvertMMXpII32_16RGB555 27 GLOBAL _ConvertMMXpII32_16RGB555
28 GLOBAL _ConvertMMXpII32_16BGR555 28 GLOBAL _ConvertMMXpII32_16BGR555
29 29
30 EXTERN _mmxreturn 30 EXTERN _mmxreturn
31 31
32 SECTION .data 32 ;; Macros for conversion routines
33 33
34 ALIGN 8 34 %macro _push_immq_mask 1
35 35 push dword %1
36 ;; Constants for conversion routines 36 push dword %1
37 37 %endmacro
38 mmx32_rgb888_mask dd 00ffffffh,00ffffffh 38
39 39 %macro load_immq 2
40 mmx32_rgb565_b dd 000000f8h, 000000f8h 40 _push_immq_mask %2
41 mmx32_rgb565_g dd 0000fc00h, 0000fc00h 41 movq %1, [esp]
42 mmx32_rgb565_r dd 00f80000h, 00f80000h 42 %endmacro
43 43
44 mmx32_rgb555_rb dd 00f800f8h,00f800f8h 44 %macro pand_immq 2
45 mmx32_rgb555_g dd 0000f800h,0000f800h 45 _push_immq_mask %2
46 mmx32_rgb555_mul dd 20000008h,20000008h 46 pand %1, [esp]
47 mmx32_bgr555_mul dd 00082000h,00082000h 47 %endmacro
48 48
49 49 %define CLEANUP_IMMQ_LOADS(num) \
50 50 add esp, byte 8 * num
51
52 %define mmx32_rgb888_mask 00ffffffh
53 %define mmx32_rgb565_b 000000f8h
54 %define mmx32_rgb565_g 0000fc00h
55 %define mmx32_rgb565_r 00f80000h
56
57 %define mmx32_rgb555_rb 00f800f8h
58 %define mmx32_rgb555_g 0000f800h
59 %define mmx32_rgb555_mul 20000008h
60 %define mmx32_bgr555_mul 00082000h
61
51 SECTION .text 62 SECTION .text
52 63
53 _ConvertMMXpII32_24RGB888: 64 _ConvertMMXpII32_24RGB888:
54 65
55 ; set up mm6 as the mask, mm7 as zero 66 ; set up mm6 as the mask, mm7 as zero
56 movq mm6, qword [mmx32_rgb888_mask] 67 load_immq mm6, mmx32_rgb888_mask
68 CLEANUP_IMMQ_LOADS(1)
57 pxor mm7, mm7 69 pxor mm7, mm7
58 70
59 mov edx, ecx ; save ecx 71 mov edx, ecx ; save ecx
60 and ecx, 0fffffffch ; clear lower two bits 72 and ecx, 0fffffffch ; clear lower two bits
61 jnz .L1 73 jnz .L1
113 125
114 126
115 _ConvertMMXpII32_16RGB565: 127 _ConvertMMXpII32_16RGB565:
116 128
117 ; set up masks 129 ; set up masks
118 movq mm5, [mmx32_rgb565_b] 130 load_immq mm5, mmx32_rgb565_b
119 movq mm6, [mmx32_rgb565_g] 131 load_immq mm6, mmx32_rgb565_g
120 movq mm7, [mmx32_rgb565_r] 132 load_immq mm7, mmx32_rgb565_r
133 CLEANUP_IMMQ_LOADS(3)
121 134
122 mov edx, ecx 135 mov edx, ecx
123 shr ecx, 2 136 shr ecx, 2
124 jnz .L1 137 jnz .L1
125 jmp .L2 ; not necessary at the moment, but doesn't hurt (much) 138 jmp .L2 ; not necessary at the moment, but doesn't hurt (much)
179 jmp _mmxreturn 192 jmp _mmxreturn
180 193
181 194
182 _ConvertMMXpII32_16BGR565: 195 _ConvertMMXpII32_16BGR565:
183 196
184 movq mm5, [mmx32_rgb565_r] 197 load_immq mm5, mmx32_rgb565_r
185 movq mm6, [mmx32_rgb565_g] 198 load_immq mm6, mmx32_rgb565_g
186 movq mm7, [mmx32_rgb565_b] 199 load_immq mm7, mmx32_rgb565_b
200 CLEANUP_IMMQ_LOADS(3)
187 201
188 mov edx, ecx 202 mov edx, ecx
189 shr ecx, 2 203 shr ecx, 2
190 jnz .L1 204 jnz .L1
191 jmp .L2 205 jmp .L2
251 265
252 ; the 16BGR555 converter is identical to the RGB555 one, 266 ; the 16BGR555 converter is identical to the RGB555 one,
253 ; except it uses a different multiplier for the pmaddwd 267 ; except it uses a different multiplier for the pmaddwd
254 ; instruction. cool huh. 268 ; instruction. cool huh.
255 269
256 movq mm7, qword [mmx32_bgr555_mul] 270 load_immq mm7, mmx32_bgr555_mul
257 jmp _convert_bgr555_cheat 271 jmp _convert_bgr555_cheat
258 272
259 ; This is the same as the Intel version.. they obviously went to 273 ; This is the same as the Intel version.. they obviously went to
260 ; much more trouble to expand/coil the loop than I did, so theirs 274 ; much more trouble to expand/coil the loop than I did, so theirs
261 ; would almost certainly be faster, even if only a little. 275 ; would almost certainly be faster, even if only a little.
262 ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is 276 ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
263 ; (I think) a more accurate name.. 277 ; (I think) a more accurate name..
264 _ConvertMMXpII32_16RGB555: 278 _ConvertMMXpII32_16RGB555:
265 279
266 movq mm7,qword [mmx32_rgb555_mul] 280 load_immq mm7, mmx32_rgb555_mul
267 _convert_bgr555_cheat: 281 _convert_bgr555_cheat:
268 movq mm6,qword [mmx32_rgb555_g] 282 load_immq mm6, mmx32_rgb555_g
283 CLEANUP_IMMQ_LOADS(2)
269 284
270 mov edx,ecx ; Save ecx 285 mov edx,ecx ; Save ecx
271 286
272 and ecx,BYTE 0fffffff8h ; clear lower three bits 287 and ecx,BYTE 0fffffff8h ; clear lower three bits
273 jnz .L_OK 288 jnz .L_OK
278 movq mm2,[esi+8] 293 movq mm2,[esi+8]
279 294
280 movq mm0,[esi] 295 movq mm0,[esi]
281 movq mm3,mm2 296 movq mm3,mm2
282 297
283 pand mm3,qword [mmx32_rgb555_rb] 298 pand_immq mm3, mmx32_rgb555_rb
284 movq mm1,mm0 299 movq mm1,mm0
285 300
286 pand mm1,qword [mmx32_rgb555_rb] 301 pand_immq mm1, mmx32_rgb555_rb
287 pmaddwd mm3,mm7 302 pmaddwd mm3,mm7
303
304 CLEANUP_IMMQ_LOADS(2)
288 305
289 pmaddwd mm1,mm7 306 pmaddwd mm1,mm7
290 pand mm2,mm6 307 pand mm2,mm6
291 308
292 .L1: 309 .L1:
300 por mm1,mm0 317 por mm1,mm0
301 318
302 movq mm0,mm4 319 movq mm0,mm4
303 psrld mm1,6 320 psrld mm1,6
304 321
305 pand mm0,qword [mmx32_rgb555_rb] 322 pand_immq mm0, mmx32_rgb555_rb
306 packssdw mm1,mm3 323 packssdw mm1,mm3
307 324
308 movq mm3,mm5 325 movq mm3,mm5
309 pmaddwd mm0,mm7 326 pmaddwd mm0,mm7
310 327
311 pand mm3,qword [mmx32_rgb555_rb] 328 pand_immq mm3, mmx32_rgb555_rb
312 pand mm4,mm6 329 pand mm4,mm6
313 330
314 movq [edi],mm1 331 movq [edi],mm1
315 pmaddwd mm3,mm7 332 pmaddwd mm3,mm7
316 333
327 psrld mm5,6 344 psrld mm5,6
328 345
329 movq mm3,mm2 346 movq mm3,mm2
330 movq mm1,mm0 347 movq mm1,mm0
331 348
332 pand mm3,qword [mmx32_rgb555_rb] 349 pand_immq mm3, mmx32_rgb555_rb
333 packssdw mm5,mm4 350 packssdw mm5,mm4
334 351
335 pand mm1,qword [mmx32_rgb555_rb] 352 pand_immq mm1, mmx32_rgb555_rb
336 pand mm2,mm6 353 pand mm2,mm6
354
355 CLEANUP_IMMQ_LOADS(4)
337 356
338 movq [edi+8],mm5 357 movq [edi+8],mm5
339 pmaddwd mm3,mm7 358 pmaddwd mm3,mm7
340 359
341 pmaddwd mm1,mm7 360 pmaddwd mm1,mm7