Mercurial > sdl-ios-xcode
annotate src/hermes/mmxp2_32.asm @ 885:9f6ad2286011
Date: Wed, 28 Apr 2004 16:52:41 -0400
From: "Damien A"
Subject: testdyngl fix
The test program you included in the latest version of SDL crashes on startup on Windows (XP). The reason for this is that OpenGL functions on Windows use the __stdcall calling convention, not the C convention. Placing APIENTRY in front of the * operator solves this problem.
author | Sam Lantinga <slouken@libsdl.org> |
---|---|
date | Fri, 30 Apr 2004 18:33:30 +0000 |
parents | 77b6110c797d |
children | da33b7e6d181 |
rev | line source |
---|---|
0 | 1 ; |
2 ; pII-optimised MMX format converters for HERMES | |
3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk) | |
4 ; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au) | |
5 ; This source code is licensed under the GNU LGPL | |
6 ; | |
7 ; Please refer to the file COPYING.LIB contained in the distribution for | |
8 ; licensing conditions | |
9 ; | |
10 ; COPYRIGHT NOTICE | |
11 ; | |
12 ; This file partly contains code that is (c) Intel Corporation, specifically | |
13 ; the mode detection routine, and the converter to 15 bit (8 pixel | |
14 ; conversion routine from the mmx programming tutorial pages). | |
15 ; | |
16 ; | |
17 ; These routines aren't exactly pII optimised - it's just that as they | |
18 ; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to | |
19 ; optimise them for p5 MMXs.. | |
20 | |
BITS 32

; Converter entry points exported from this file.
GLOBAL _ConvertMMXpII32_24RGB888
GLOBAL _ConvertMMXpII32_16RGB565
GLOBAL _ConvertMMXpII32_16BGR565
GLOBAL _ConvertMMXpII32_16RGB555
GLOBAL _ConvertMMXpII32_16BGR555

; Every converter leaves through this externally defined label.
EXTERN _mmxreturn

SECTION .data

ALIGN 8

;; Constants for the conversion routines.
;; Each one is a 64-bit value built from the same dword twice, so a single
;; MMX operation applies it to two 32-bit source pixels at once.

; Keeps the low three bytes of each pixel, clears the top byte.
mmx32_rgb888_mask:  dd 00ffffffh, 00ffffffh

; Per-channel masks for the 5-6-5 formats.
mmx32_rgb565_b:     dd 000000f8h, 000000f8h     ; top 5 bits of byte 0
mmx32_rgb565_g:     dd 0000fc00h, 0000fc00h     ; top 6 bits of byte 1
mmx32_rgb565_r:     dd 00f80000h, 00f80000h     ; top 5 bits of byte 2

; Masks and pmaddwd multipliers for the 5-5-5 formats.
mmx32_rgb555_rb:    dd 00f800f8h, 00f800f8h     ; top 5 bits of bytes 0 and 2
mmx32_rgb555_g:     dd 0000f800h, 0000f800h     ; top 5 bits of byte 1
mmx32_rgb555_mul:   dd 20000008h, 20000008h     ; low word * 8, high word * 2000h
mmx32_bgr555_mul:   dd 00082000h, 00082000h     ; low word * 2000h, high word * 8



SECTION .text
52 | |
;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
;
; Packs 32-bit pixels into 24-bit pixels by dropping the top byte of
; every source dword.
;
; In:    esi = source (4 bytes per pixel)
;        edi = destination (3 bytes per pixel)
;        ecx = pixel count
; Exit:  jumps to the external label _mmxreturn
; Clobb: eax (al), ebx (bl), ecx, edx, esi, edi, mm0-mm4, mm6, mm7, flags
;
; Lane diagrams list bytes from most to least significant.  In the MMX
; loop pN is the N-th pixel of the current group of four; rN/gN/bN are
; its bytes 2/1/0.
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        movq    mm6, qword [mmx32_rgb888_mask]  ; mm6 = "keep low 3 bytes" mask
        pxor    mm7, mm7                        ; mm7 = 0, used for unpacking

        mov     edx, ecx                        ; remember the full count
        and     ecx, 0fffffffch                 ; ecx = count rounded down to x4
        jz      near .tail                      ; fewer than 4 pixels: tail only

.quad_loop:
        ; Load four pixels and clear their top bytes.
        movq    mm0, [esi]                      ; A1 r1 g1 b1 A0 r0 g0 b0
        pand    mm0, mm6                        ;  0 r1 g1 b1  0 r0 g0 b0
        movq    mm1, [esi+8]                    ; A3 r3 g3 b3 A2 r2 g2 b2
        pand    mm1, mm6                        ;  0 r3 g3 b3  0 r2 g2 b2

        ; First 8 output bytes: p0, p1 and the low two bytes of p2.
        movq    mm2, mm0
        punpckhdq mm2, mm7                      ;  0  0  0  0  0 r1 g1 b1
        punpckldq mm0, mm7                      ;  0  0  0  0  0 r0 g0 b0
        psllq   mm2, 24                         ;  0  0 r1 g1 b1  0  0  0
        por     mm0, mm2                        ;  0  0 r1 g1 b1 r0 g0 b0

        movq    mm3, mm1
        psllq   mm3, 48                         ; g2 b2  0  0  0  0  0  0
        por     mm0, mm3                        ; g2 b2 r1 g1 b1 r0 g0 b0

        ; Last 4 output bytes: top byte of p2 followed by p3.
        movq    mm4, mm1
        punpckhdq mm4, mm7                      ;  0  0  0  0  0 r3 g3 b3
        punpckldq mm1, mm7                      ;  0  0  0  0  0 r2 g2 b2
        psrlq   mm1, 16                         ;  0  0  0  0  0  0  0 r2
        psllq   mm4, 8                          ;  0  0  0  0 r3 g3 b3  0
        por     mm1, mm4                        ;  0  0  0  0 r3 g3 b3 r2

        ; Store 12 bytes, then step past 4 source / 4 destination pixels.
        movq    [edi], mm0
        movd    [edi+8], mm1
        add     esi, BYTE 16
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad_loop

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3                     ; 0..3 pixels left over
        jz      .done

.pixel_loop:
        ; Copy bytes 0..2 of the pixel, skip byte 3.
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .pixel_loop

.done:
        jmp     _mmxreturn
112 | |
113 | |
114 | |
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
;
; Converts 32-bit pixels to 16-bit 5-6-5 pixels.  Source byte 2 ends up
; in output bits 11-15, byte 1 in bits 5-10 and byte 0 in bits 0-4.
;
; In:    esi = source (4 bytes per pixel)
;        edi = destination (2 bytes per pixel)
;        ecx = pixel count
; Exit:  jumps to the external label _mmxreturn
; Clobb: eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        ; Channel masks: mm5 = byte 0, mm6 = byte 1, mm7 = byte 2.
        movq    mm5, [mmx32_rgb565_b]
        movq    mm6, [mmx32_rgb565_g]
        movq    mm7, [mmx32_rgb565_r]

        mov     edx, ecx                        ; remember the full count
        shr     ecx, 2                          ; ecx = number of 4-pixel groups
        jz      near .tail                      ; fewer than 4 pixels: tail only

.quad_loop:
        ; Pixels 0 and 1: mm0 collects green+blue, mm3 keeps red.
        movq    mm0, [esi]
        movq    mm1, mm0
        movq    mm3, mm0
        pand    mm0, mm6                        ; gggggg00 00000000
        pand    mm1, mm5                        ; 00000000 bbbbb000
        pand    mm3, mm7                        ; red, still in byte 2
        pslld   mm1, 2                          ; 000000bb bbb00000
        por     mm0, mm1                        ; ggggggbb bbb00000
        psrld   mm0, 5                          ; 00000ggg gggbbbbb

        ; Pixels 2 and 3: mm4 collects green+blue, mm1 keeps red.
        movq    mm4, [esi+8]
        movq    mm2, mm4
        movq    mm1, mm4
        pand    mm4, mm6                        ; gggggg00 00000000
        pand    mm2, mm5                        ; 00000000 bbbbb000
        pand    mm1, mm7                        ; red, still in byte 2
        pslld   mm2, 2                          ; 000000bb bbb00000
        por     mm4, mm2                        ; ggggggbb bbb00000
        psrld   mm4, 5                          ; 00000ggg gggbbbbb

        ; Packing the red words to bytes drops each one into the high byte
        ; of a 16-bit lane (rrrrr000 00000000), matching bits 11-15.
        packuswb mm3, mm1
        packssdw mm0, mm4                       ; four 00000ggg gggbbbbb words
        por     mm0, mm3                        ; four rrrrrggg gggbbbbb words
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad_loop

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3                     ; 0..3 pixels left over
        jz      .done

.pixel_loop:
        mov     al, [esi]                       ; al = byte 0 (blue)
        mov     bh, [esi+1]                     ; bh = byte 1 (green)
        mov     ah, [esi+2]                     ; ah = byte 2 (red)
        shr     al, 3
        and     eax, 0F81Fh                     ; rrrrr000 000bbbbb
        shr     ebx, 5
        and     ebx, 07E0h                      ; 00000ggg ggg00000
        or      eax, ebx                        ; fields are disjoint
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .pixel_loop

.done:
        jmp     _mmxreturn
180 | |
181 | |
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
;
; Converts 32-bit pixels to 16-bit 5-6-5 pixels with the outer channels
; swapped: source byte 0 ends up in output bits 11-15, byte 1 in bits
; 5-10 and byte 2 in bits 0-4.
;
; In:    esi = source (4 bytes per pixel)
;        edi = destination (2 bytes per pixel)
;        ecx = pixel count
; Exit:  jumps to the external label _mmxreturn
; Clobb: eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        ; Channel masks: mm5 = byte 2, mm6 = byte 1, mm7 = byte 0.
        movq    mm5, [mmx32_rgb565_r]
        movq    mm6, [mmx32_rgb565_g]
        movq    mm7, [mmx32_rgb565_b]

        mov     edx, ecx                        ; remember the full count
        shr     ecx, 2                          ; ecx = number of 4-pixel groups
        jz      near .tail                      ; fewer than 4 pixels: tail only

.quad_loop:
        ; Pixels 0 and 1: mm0 collects green+red, mm3 keeps blue.
        movq    mm0, [esi]
        movq    mm1, mm0
        movq    mm3, mm0
        pand    mm0, mm6                        ; gggggg00 00000000
        pand    mm1, mm5                        ; red, in byte 2
        pand    mm3, mm7                        ; blue, in byte 0

        psllq   mm3, 16                         ; blue moved up to byte 2
        psrld   mm1, 14                         ; 000000rr rrr00000
        por     mm0, mm1                        ; ggggggrr rrr00000
        psrld   mm0, 5                          ; 00000ggg gggrrrrr

        ; Pixels 2 and 3: mm4 collects green+red, mm1 keeps blue.
        movq    mm4, [esi+8]
        movq    mm2, mm4
        movq    mm1, mm4
        pand    mm4, mm6                        ; gggggg00 00000000
        pand    mm2, mm5                        ; red, in byte 2
        pand    mm1, mm7                        ; blue, in byte 0

        psllq   mm1, 16                         ; blue moved up to byte 2
        psrld   mm2, 14                         ; 000000rr rrr00000
        por     mm4, mm2                        ; ggggggrr rrr00000
        psrld   mm4, 5                          ; 00000ggg gggrrrrr

        ; Packing the blue words to bytes drops each one into the high byte
        ; of a 16-bit lane (bbbbb000 00000000), matching bits 11-15.
        packuswb mm3, mm1
        packssdw mm0, mm4                       ; four 00000ggg gggrrrrr words
        por     mm0, mm3                        ; four bbbbbggg gggrrrrr words
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad_loop

.tail:
        and     edx, BYTE 3                     ; edx = 0..3 pixels left over
        jz      .done

.pixel_loop:
        mov     al, [esi+2]                     ; al = byte 2 (red)
        mov     bh, [esi+1]                     ; bh = byte 1 (green)
        mov     ah, [esi]                       ; ah = byte 0 (blue)
        shr     al, 3
        and     eax, 0F81Fh                     ; bbbbb000 000rrrrr
        shr     ebx, 5
        and     ebx, 07E0h                      ; 00000ggg ggg00000
        or      eax, ebx                        ; fields are disjoint
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .pixel_loop

.done:
        jmp     _mmxreturn
249 | |
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
;
; Convert 32-bit pixels to 16-bit 5-5-5 pixels.  Both entry points share
; one body; they differ only in the pmaddwd multiplier loaded into mm7,
; which decides whether source byte 2 or source byte 0 lands in output
; bits 10-14 (the other one lands in bits 0-4).  Byte 1 always lands in
; bits 5-9 and bit 15 is zero.
;
; In:    esi = source (4 bytes per pixel)
;        edi = destination (2 bytes per pixel)
;        ecx = pixel count
; Exit:  jumps to the external label _mmxreturn
; Clobb: eax, ecx, edx, esi, edi, mm0-mm7, flags
;
; Per-pixel arithmetic (used by the main loop and by the tail):
;     t  = pmaddwd(pixel & mmx32_rgb555_rb, mm7)   ; place the outer channels
;     t |= pixel & mmx32_rgb555_g                  ; add the middle channel
;     t >>= 6                                      ; 5-5-5 value in low word
;
; NOTE: the main loop is software-pipelined and pre-loads the first 16
; bytes of the following group before it tests the counter, so after the
; last full group it reads 16 bytes starting at the first tail pixel even
; when fewer than 4 tail pixels remain.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        ; Identical to the RGB555 converter apart from the multiplier.
        movq    mm7, qword [mmx32_bgr555_mul]
        jmp     _convert_bgr555_cheat

; The main loop below keeps the instruction schedule of the Intel routine
; it is based on; the constant that routine called 'mmx32_rgb555_add' is
; named 'mmx32_rgb555_mul' here.
_ConvertMMXpII32_16RGB555:

        movq    mm7, qword [mmx32_rgb555_mul]

_convert_bgr555_cheat:
        movq    mm6, qword [mmx32_rgb555_g]     ; mm6 = middle-channel mask

        mov     edx, ecx                        ; remember the full count

        and     ecx, 0fffffff8h                 ; ecx = count rounded down to x8
        ; Explicit NEAR: the target lies beyond short-jump range (an earlier
        ; revision failed to assemble with "short jump out of range").
        jz      near .tail

        ; Prime the pipeline with the first four pixels of the first group.
        movq    mm2, [esi+8]

        movq    mm0, [esi]
        movq    mm3, mm2

        pand    mm3, qword [mmx32_rgb555_rb]
        movq    mm1, mm0

        pand    mm1, qword [mmx32_rgb555_rb]
        pmaddwd mm3, mm7

        pmaddwd mm1, mm7
        pand    mm2, mm6

.group_loop:
        ; Finish pixels 0-3 (mm1/mm0 and mm3/mm2) while starting 4-7.
        movq    mm4, [esi+24]
        pand    mm0, mm6

        movq    mm5, [esi+16]
        por     mm3, mm2

        psrld   mm3, 6
        por     mm1, mm0

        movq    mm0, mm4
        psrld   mm1, 6

        pand    mm0, qword [mmx32_rgb555_rb]
        packssdw mm1, mm3                       ; pixels 0-3 as four words

        movq    mm3, mm5
        pmaddwd mm0, mm7

        pand    mm3, qword [mmx32_rgb555_rb]
        pand    mm4, mm6

        movq    [edi], mm1
        pmaddwd mm3, mm7

        add     esi, BYTE 32
        por     mm4, mm0

        pand    mm5, mm6
        psrld   mm4, 6

        ; Pre-load and start the first four pixels of the next group.
        movq    mm2, [esi+8]
        por     mm5, mm3

        movq    mm0, [esi]
        psrld   mm5, 6

        movq    mm3, mm2
        movq    mm1, mm0

        pand    mm3, qword [mmx32_rgb555_rb]
        packssdw mm5, mm4                       ; pixels 4-7 as four words

        pand    mm1, qword [mmx32_rgb555_rb]
        pand    mm2, mm6

        movq    [edi+8], mm5
        pmaddwd mm3, mm7

        pmaddwd mm1, mm7
        add     edi, BYTE 16

        sub     ecx, BYTE 8
        jz      .tail
        jmp     .group_loop


.tail:
        mov     ecx, edx

        and     ecx, BYTE 7                     ; 0..7 pixels left over
        jz      .done

.pixel_loop:
        ; Convert one pixel with the same masks and multiplier as the main
        ; loop, so the channel order follows the entry point that was used.
        ; (A fixed scalar RGB555 sequence here would swap the outer channels
        ; of the tail pixels whenever the BGR555 entry point was taken.)
        movd    mm0, [esi]                      ; pixel in the low dword
        add     esi, BYTE 4

        movq    mm1, mm0
        pand    mm1, qword [mmx32_rgb555_rb]    ; top 5 bits of bytes 0 and 2
        pand    mm0, mm6                        ; top 5 bits of byte 1

        pmaddwd mm1, mm7                        ; place the outer channels
        por     mm1, mm0
        psrld   mm1, 6                          ; 5-5-5 value in the low word

        movd    eax, mm1
        mov     [edi], ax
        add     edi, BYTE 2

        dec     ecx
        jnz     .pixel_loop

.done:
        jmp     _mmxreturn
384 | |
385 | |
386 |