Mercurial > sdl-ios-xcode
annotate src/hermes/mmxp2_32.asm @ 526:4314a501d7be
Fixed a crash blitting RLE surfaces to RLE surfaces
author | Sam Lantinga <slouken@libsdl.org> |
---|---|
date | Fri, 11 Oct 2002 07:56:36 +0000 |
parents | 77b6110c797d |
children | da33b7e6d181 |
rev | line source |
---|---|
0 | 1 ; |
2 ; pII-optimised MMX format converters for HERMES | |
3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk) | |
4 ; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au) | |
5 ; This source code is licensed under the GNU LGPL | |
6 ; | |
7 ; Please refer to the file COPYING.LIB contained in the distribution for | |
8 ; licensing conditions | |
9 ; | |
10 ; COPYRIGHT NOTICE | |
11 ; | |
12 ; This file partly contains code that is (c) Intel Corporation, specifically | |
13 ; the mode detection routine, and the converter to 15 bit (8 pixel | |
14 ; conversion routine from the mmx programming tutorial pages). | |
15 ; | |
16 ; | |
17 ; These routines aren't exactly pII optimised - it's just that as they | |
18 ; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to | |
19 ; optimise them for p5 MMXs.. | |
20 | |
21 BITS 32 | |
22 | |
23 | |
24 GLOBAL _ConvertMMXpII32_24RGB888 | |
25 GLOBAL _ConvertMMXpII32_16RGB565 | |
26 GLOBAL _ConvertMMXpII32_16BGR565 | |
27 GLOBAL _ConvertMMXpII32_16RGB555 | |
28 GLOBAL _ConvertMMXpII32_16BGR555 | |
29 | |
30 EXTERN _mmxreturn | |
31 | |
32 SECTION .data | |
33 | |
34 ALIGN 8 | |
35 | |
36 ;; Constants for conversion routines: each is one dword repeated twice, loaded with movq so it covers two 32-bit pixels per MMX register; the *_mul values are pmaddwd multipliers | |
37 | |
38 mmx32_rgb888_mask dd 00ffffffh,00ffffffh | |
39 | |
40 mmx32_rgb565_b dd 000000f8h, 000000f8h | |
41 mmx32_rgb565_g dd 0000fc00h, 0000fc00h | |
42 mmx32_rgb565_r dd 00f80000h, 00f80000h | |
43 | |
44 mmx32_rgb555_rb dd 00f800f8h,00f800f8h | |
45 mmx32_rgb555_g dd 0000f800h,0000f800h | |
46 mmx32_rgb555_mul dd 20000008h,20000008h | |
47 mmx32_bgr555_mul dd 00082000h,00082000h | |
48 | |
49 | |
50 | |
51 SECTION .text | |
52 | |
; _ConvertMMXpII32_24RGB888: copy the low three bytes of each 32-bit source pixel into packed 24-bit output.
; In: esi = source, edi = destination, ecx = pixel count. Leaves via jmp _mmxreturn.
; Overwrites al, bl, ecx, edx, esi, edi, mm0-mm4, mm6 and mm7.
53 _ConvertMMXpII32_24RGB888: | |
54 | |
55 ; set up mm6 as the mask, mm7 as zero | |
56 movq mm6, qword [mmx32_rgb888_mask] | |
57 pxor mm7, mm7 | |
58 | |
59 mov edx, ecx ; save ecx (full pixel count, needed again for the tail) | |
60 and ecx, 0fffffffch ; clear lower two bits: pixels handled four at a time | |
61 jnz .L1 | |
62 jmp .L2 | |
63 | |
64 .L1: | |
65 | |
66 movq mm0, [esi] ; A R G B a r g b (source pixels 0 and 1) | |
67 pand mm0, mm6 ; 0 R G B 0 r g b | |
68 movq mm1, [esi+8] ; A R G B a r g b (source pixels 2 and 3) | |
69 pand mm1, mm6 ; 0 R G B 0 r g b | |
70 | |
71 movq mm2, mm0 ; 0 R G B 0 r g b | |
72 punpckhdq mm2, mm7 ; 0 0 0 0 0 R G B | |
73 punpckldq mm0, mm7 ; 0 0 0 0 0 r g b | |
74 psllq mm2, 24 ; 0 0 R G B 0 0 0 | |
75 por mm0, mm2 ; 0 0 R G B r g b | |
76 | |
77 movq mm3, mm1 ; 0 R G B 0 r g b | |
78 psllq mm3, 48 ; g b 0 0 0 0 0 0 | |
79 por mm0, mm3 ; g b R G B r g b (first 8 output bytes) | |
80 | |
81 movq mm4, mm1 ; 0 R G B 0 r g b | |
82 punpckhdq mm4, mm7 ; 0 0 0 0 0 R G B | |
83 punpckldq mm1, mm7 ; 0 0 0 0 0 r g b | |
84 psrlq mm1, 16 ; 0 0 0 0 0 0 0 r | |
85 psllq mm4, 8 ; 0 0 0 0 R G B 0 | |
86 por mm1, mm4 ; 0 0 0 0 R G B r (last 4 output bytes) | |
87 | |
88 movq [edi], mm0 | |
89 add esi, BYTE 16 | |
90 movd [edi+8], mm1 | |
91 add edi, BYTE 12 | |
92 sub ecx, BYTE 4 | |
93 jnz .L1 | |
94 | |
95 .L2: | |
96 mov ecx, edx | |
97 and ecx, BYTE 3 | |
98 jz .L4 | |
99 .L3: | |
100 mov al, [esi] | |
101 mov bl, [esi+1] | |
102 mov dl, [esi+2] | |
103 mov [edi], al | |
104 mov [edi+1], bl | |
105 mov [edi+2], dl | |
106 add esi, BYTE 4 | |
107 add edi, BYTE 3 | |
108 dec ecx | |
109 jnz .L3 | |
110 .L4: | |
111 jmp _mmxreturn | |
112 | |
113 | |
114 | |
; _ConvertMMXpII32_16RGB565: convert 32-bit pixels at [esi] to 16-bit 5-6-5 pixels at [edi] (red in bits 11-15, green 5-10, blue 0-4).
; In: esi = source, edi = destination, ecx = pixel count. Leaves via jmp _mmxreturn.
; Overwrites eax, ebx, ecx, edx, esi, edi and mm0-mm7.
115 _ConvertMMXpII32_16RGB565: | |
116 | |
117 ; set up masks | |
118 movq mm5, [mmx32_rgb565_b] | |
119 movq mm6, [mmx32_rgb565_g] | |
120 movq mm7, [mmx32_rgb565_r] | |
121 | |
122 mov edx, ecx | |
123 shr ecx, 2 | |
124 jnz .L1 | |
125 jmp .L2 ; not necessary at the moment, but doesn't hurt (much) | |
126 | |
127 .L1: | |
128 movq mm0, [esi] ; argb | |
129 movq mm1, mm0 ; argb | |
130 pand mm0, mm6 ; 00g0 | |
131 movq mm3, mm1 ; argb | |
132 pand mm1, mm5 ; 000b | |
133 pand mm3, mm7 ; 0r00 | |
134 pslld mm1, 2 ; 0 0 000000bb bbb00000 | |
135 por mm0, mm1 ; 0 0 ggggggbb bbb00000 | |
136 psrld mm0, 5 ; 0 0 00000ggg gggbbbbb | |
137 | |
138 movq mm4, [esi+8] ; argb | |
139 movq mm2, mm4 ; argb | |
140 pand mm4, mm6 ; 00g0 | |
141 movq mm1, mm2 ; argb | |
142 pand mm2, mm5 ; 000b | |
143 pand mm1, mm7 ; 0r00 | |
144 pslld mm2, 2 ; 0 0 000000bb bbb00000 | |
145 por mm4, mm2 ; 0 0 ggggggbb bbb00000 | |
146 psrld mm4, 5 ; 0 0 00000ggg gggbbbbb | |
147 | |
148 packuswb mm3, mm1 ; four words of rrrrr000 00000000 (red in bits 11-15) | |
149 packssdw mm0, mm4 ; four words of 00000ggg gggbbbbb | |
150 por mm0, mm3 ; done. | |
151 movq [edi], mm0 | |
152 | |
153 add esi, 16 | |
154 add edi, 8 | |
155 dec ecx | |
156 jnz .L1 | |
157 | |
158 .L2: | |
159 mov ecx, edx | |
160 and ecx, BYTE 3 | |
161 jz .L4 | |
162 .L3: | |
163 mov al, [esi] | |
164 mov bh, [esi+1] | |
165 mov ah, [esi+2] | |
166 shr al, 3 | |
167 and eax, 0F81Fh ; keep red (bits 11-15) and blue (bits 0-4); mask is too wide for a BYTE immediate | |
168 shr ebx, 5 | |
169 and ebx, 07E0h ; keep green (bits 5-10); mask is too wide for a BYTE immediate | |
170 add eax, ebx | |
171 mov [edi], al | |
172 mov [edi+1], ah | |
173 add esi, BYTE 4 | |
174 add edi, BYTE 2 | |
175 dec ecx | |
176 jnz .L3 | |
177 | |
178 .L4: | |
179 jmp _mmxreturn | |
180 | |
181 | |
; _ConvertMMXpII32_16BGR565: convert 32-bit pixels at [esi] to 16-bit 5-6-5 pixels at [edi] with blue in bits 11-15, green 5-10, red 0-4.
; In: esi = source, edi = destination, ecx = pixel count. Leaves via jmp _mmxreturn.
; Overwrites eax, ebx, ecx, edx, esi, edi and mm0-mm7.
182 _ConvertMMXpII32_16BGR565: | |
183 | |
184 movq mm5, [mmx32_rgb565_r] | |
185 movq mm6, [mmx32_rgb565_g] | |
186 movq mm7, [mmx32_rgb565_b] | |
187 | |
188 mov edx, ecx | |
189 shr ecx, 2 | |
190 jnz .L1 | |
191 jmp .L2 | |
192 | |
193 .L1: | |
194 movq mm0, [esi] ; a r g b | |
195 movq mm1, mm0 ; a r g b | |
196 pand mm0, mm6 ; 0 0 g 0 | |
197 movq mm3, mm1 ; a r g b | |
198 pand mm1, mm5 ; 0 r 0 0 | |
199 pand mm3, mm7 ; 0 0 0 b | |
200 | |
201 psllq mm3, 16 ; 0 b 0 0 | |
202 psrld mm1, 14 ; 0 0 000000rr rrr00000 | |
203 por mm0, mm1 ; 0 0 ggggggrr rrr00000 | |
204 psrld mm0, 5 ; 0 0 00000ggg gggrrrrr | |
205 | |
206 movq mm4, [esi+8] ; a r g b | |
207 movq mm2, mm4 ; a r g b | |
208 pand mm4, mm6 ; 0 0 g 0 | |
209 movq mm1, mm2 ; a r g b | |
210 pand mm2, mm5 ; 0 r 0 0 | |
211 pand mm1, mm7 ; 0 0 0 b | |
212 | |
213 psllq mm1, 16 ; 0 b 0 0 | |
214 psrld mm2, 14 ; 0 0 000000rr rrr00000 | |
215 por mm4, mm2 ; 0 0 ggggggrr rrr00000 | |
216 psrld mm4, 5 ; 0 0 00000ggg gggrrrrr | |
217 | |
218 packuswb mm3, mm1 ; BBBBB000 00000000 bbbbb000 00000000 | |
219 packssdw mm0, mm4 ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR | |
220 por mm0, mm3 ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr | |
221 movq [edi], mm0 | |
222 | |
223 add esi, BYTE 16 | |
224 add edi, BYTE 8 | |
225 dec ecx | |
226 jnz .L1 | |
227 | |
228 .L2: | |
229 and edx, BYTE 3 | |
230 jz .L4 | |
231 .L3: | |
232 mov al, [esi+2] | |
233 mov bh, [esi+1] | |
234 mov ah, [esi] | |
235 shr al, 3 | |
236 and eax, 0F81Fh ; keep blue (bits 11-15) and red (bits 0-4); mask is too wide for a BYTE immediate | |
237 shr ebx, 5 | |
238 and ebx, 07E0h ; keep green (bits 5-10); mask is too wide for a BYTE immediate | |
239 add eax, ebx | |
240 mov [edi], al | |
241 mov [edi+1], ah | |
242 add esi, BYTE 4 | |
243 add edi, BYTE 2 | |
244 dec edx | |
245 jnz .L3 | |
246 | |
247 .L4: | |
248 jmp _mmxreturn | |
249 | |
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555: convert 32-bit pixels at [esi] to 15-bit (5-5-5) pixels stored as 16-bit words at [edi].
; In: esi = source, edi = destination, ecx = pixel count. Both entry points leave via jmp _mmxreturn.
; The entry points differ only in the pmaddwd multiplier they load into mm7; mm6 holds the green mask.
; Overwrites eax, ecx, edx, esi, edi and mm0-mm7.
250 _ConvertMMXpII32_16BGR555: | |
251 | |
252 ; the 16BGR555 converter is identical to the RGB555 one, | |
253 ; except it uses a different multiplier for the pmaddwd | |
254 ; instruction (mm7), in both the main loop and the tail loop. | |
255 | |
256 movq mm7, qword [mmx32_bgr555_mul] | |
257 jmp _convert_bgr555_cheat | |
258 | |
259 ; This is the same as the Intel version.. they obviously went to | |
260 ; much more trouble to expand/coil the loop than I did, so theirs | |
261 ; would almost certainly be faster, even if only a little. | |
262 ; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is | |
263 ; (I think) a more accurate name.. | |
264 _ConvertMMXpII32_16RGB555: | |
265 | |
266 movq mm7,qword [mmx32_rgb555_mul] | |
267 _convert_bgr555_cheat: | |
268 movq mm6,qword [mmx32_rgb555_g] | |
269 | |
270 mov edx,ecx ; Save ecx | |
271 | |
272 and ecx,BYTE 0fffffff8h ; clear lower three bits | |
273 jnz .L_OK | |
289
77b6110c797d
Fixed "short jump out of range" error in MMX code
Sam Lantinga <slouken@libsdl.org>
parents:
0
diff
changeset
|
274 jmp near .L2 |
0 | 275 |
276 .L_OK: | |
277 | |
278 movq mm2,[esi+8] | |
279 | |
280 movq mm0,[esi] | |
281 movq mm3,mm2 | |
282 | |
283 pand mm3,qword [mmx32_rgb555_rb] | |
284 movq mm1,mm0 | |
285 | |
286 pand mm1,qword [mmx32_rgb555_rb] | |
287 pmaddwd mm3,mm7 | |
288 | |
289 pmaddwd mm1,mm7 | |
290 pand mm2,mm6 | |
291 | |
292 .L1: | |
293 movq mm4,[esi+24] | |
294 pand mm0,mm6 | |
295 | |
296 movq mm5,[esi+16] | |
297 por mm3,mm2 | |
298 | |
299 psrld mm3,6 | |
300 por mm1,mm0 | |
301 | |
302 movq mm0,mm4 | |
303 psrld mm1,6 | |
304 | |
305 pand mm0,qword [mmx32_rgb555_rb] | |
306 packssdw mm1,mm3 | |
307 | |
308 movq mm3,mm5 | |
309 pmaddwd mm0,mm7 | |
310 | |
311 pand mm3,qword [mmx32_rgb555_rb] | |
312 pand mm4,mm6 | |
313 | |
314 movq [edi],mm1 | |
315 pmaddwd mm3,mm7 | |
316 | |
317 add esi,BYTE 32 | |
318 por mm4,mm0 | |
319 | |
320 pand mm5,mm6 | |
321 psrld mm4,6 | |
322 | |
323 movq mm2,[esi+8] ; preload for the next pass; on the final pass this and the load below read the 16 bytes after the last full group of 8 pixels - NOTE(review): confirm the source is always readable there | |
324 por mm5,mm3 | |
325 | |
326 movq mm0,[esi] | |
327 psrld mm5,6 | |
328 | |
329 movq mm3,mm2 | |
330 movq mm1,mm0 | |
331 | |
332 pand mm3,qword [mmx32_rgb555_rb] | |
333 packssdw mm5,mm4 | |
334 | |
335 pand mm1,qword [mmx32_rgb555_rb] | |
336 pand mm2,mm6 | |
337 | |
338 movq [edi+8],mm5 | |
339 pmaddwd mm3,mm7 | |
340 | |
341 pmaddwd mm1,mm7 | |
342 add edi,BYTE 16 | |
343 | |
344 sub ecx,BYTE 8 | |
345 jz .L2 | |
346 jmp .L1 | |
347 | |
348 | |
349 .L2: | |
350 mov ecx,edx | |
351 | |
352 and ecx,BYTE 7 ; leftover pixels (count mod 8) | |
353 jz .L4 | |
354 | |
355 .L3: | |
356 ; Convert one leftover pixel with the same mask / pmaddwd / shift | |
357 ; sequence as the main loop. mm7 still holds the multiplier chosen by | |
358 ; the entry point, so the tail uses the same channel order as the rest | |
359 ; of the row (a fixed scalar tail would always emit RGB555 order). | |
360 | |
361 movd mm0,[esi] ; one source pixel in the low dword, high dword zeroed | |
362 add esi,BYTE 4 | |
363 | |
364 movq mm1,mm0 | |
365 pand mm0,mm6 ; green, still in bits 11-15 | |
366 | |
367 pand mm1,qword [mmx32_rgb555_rb] ; red and blue, one per 16-bit word | |
368 pmaddwd mm1,mm7 ; move red/blue to either side of green | |
369 | |
370 por mm0,mm1 | |
371 | |
372 psrld mm0,6 ; finished 15-bit pixel in bits 0-14 | |
373 | |
374 movd eax,mm0 | |
375 | |
376 mov [edi],ax | |
377 add edi,BYTE 2 | |
378 | |
379 dec ecx | |
380 jnz .L3 | |
381 | |
382 .L4: | |
383 jmp _mmxreturn | |
384 | |
385 | |
386 |