Mercurial > sdl-ios-xcode
annotate src/hermes/mmxp2_32.asm @ 1187:19d8949b4584
To: sdl@libsdl.org
From: Staffan Ulfberg <staffan@ulfberg.se>
Date: 19 Nov 2005 01:00:48 +0100
Subject: [SDL] New driver for OpenBSD/wscons
Hello,
I've written an SDL driver for OpenBSD/wscons (console mode, somewhat
resembling the functionality of the svga driver for Linux). I use it
for playing MAME on my Sharp Zaurus. The alternative is to play under
X, which is slower.
I asked how to submit the driver a few days ago, and posted a link to
the patch in a follow-up, so maybe it was missed?
Anyway, the patch is on the web at:
http://multivac.fatburen.org/SDL-wscons.patch
Comments?
Staffan
author | Ryan C. Gordon <icculus@icculus.org> |
---|---|
date | Tue, 22 Nov 2005 15:19:50 +0000 |
parents | da33b7e6d181 |
children | 2d6dc7de1145 |
rev | line source |
---|---|
0 | 1 ; |
2 ; pII-optimised MMX format converters for HERMES | |
3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk) | |
4 ; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au) | |
5 ; This source code is licensed under the GNU LGPL | |
6 ; | |
7 ; Please refer to the file COPYING.LIB contained in the distribution for | |
8 ; licensing conditions | |
9 ; | |
10 ; COPYRIGHT NOTICE | |
11 ; | |
12 ; This file partly contains code that is (c) Intel Corporation, specifically | |
13 ; the mode detection routine, and the converter to 15 bit (8 pixel | |
14 ; conversion routine from the mmx programming tutorial pages). | |
15 ; | |
16 ; | |
17 ; These routines aren't exactly pII optimised - it's just that as they | |
18 ; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to | |
19 ; optimise them for p5 MMXs.. | |
20 | |
21 BITS 32 | |
22 | |
23 | |
24 GLOBAL _ConvertMMXpII32_24RGB888 | |
25 GLOBAL _ConvertMMXpII32_16RGB565 | |
26 GLOBAL _ConvertMMXpII32_16BGR565 | |
27 GLOBAL _ConvertMMXpII32_16RGB555 | |
28 GLOBAL _ConvertMMXpII32_16BGR555 | |
29 | |
30 EXTERN _mmxreturn | |
31 | |
1166
da33b7e6d181
Date: Tue, 1 Nov 2005 20:25:10 +0100
Sam Lantinga <slouken@libsdl.org>
parents:
289
diff
changeset
|
32 SECTION .note.GNU-stack noalloc progbits noexec nowrite |
SECTION .data

ALIGN 8

;; Constant operands for the conversion routines.
;; Each table is one 64-bit MMX operand built from two identical dwords,
;; so the same value is applied to both 32-bit pixels held in a register.

; keeps the low three bytes of every 32-bit pixel, clears the top byte
mmx32_rgb888_mask:      times 2 dd 00ffffffh

; 565 channel masks: top 5 bits of byte 0, top 6 bits of byte 1,
; top 5 bits of byte 2
mmx32_rgb565_b:         times 2 dd 000000f8h
mmx32_rgb565_g:         times 2 dd 0000fc00h
mmx32_rgb565_r:         times 2 dd 00f80000h

; 555 masks: top 5 bits of bytes 0 and 2 together, top 5 bits of byte 1
mmx32_rgb555_rb:        times 2 dd 00f800f8h
mmx32_rgb555_g:         times 2 dd 0000f800h

; pmaddwd multipliers (one 16-bit factor per word of the masked pixel):
;   rgb555: low word * 0008h, high word * 2000h
;   bgr555: low word * 2000h, high word * 0008h
mmx32_rgb555_mul:       times 2 dd 20000008h
mmx32_bgr555_mul:       times 2 dd 00082000h
49 | |
50 | |
51 | |
52 SECTION .text | |
53 | |
;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
; Packs 32-bit pixels into 3-byte pixels by dropping the top byte of
; every source dword.
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   leaves through the external _mmxreturn label
; Clobb: al, bl, ecx, edx, esi, edi, mm0-mm4, mm6, mm7, flags
; Lane diagrams list the bytes of a 64-bit register from most to least
; significant; capitals belong to the pixel in the upper dword.
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        movq    mm6, qword [mmx32_rgb888_mask]  ; mm6 = 00ffffffh per pixel
        pxor    mm7, mm7                        ; mm7 = 0, unpack partner

        mov     edx, ecx                ; keep the full count for the tail
        and     ecx, 0fffffffch         ; ecx = count rounded down to a multiple of 4
        jz      .tail                   ; fewer than 4 pixels: skip the MMX pass

.quad:                                  ; 4 pixels: 16 bytes in, 12 bytes out
        movq    mm0, [esi]              ; A R G B a r g b   (pixels 1, 0)
        pand    mm0, mm6                ; 0 R G B 0 r g b
        movq    mm1, [esi+8]            ; A R G B a r g b   (pixels 3, 2)
        pand    mm1, mm6                ; 0 R G B 0 r g b

        movq    mm2, mm0                ; 0 R G B 0 r g b
        punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B   (pixel 1)
        punpckldq mm0, mm7              ; 0 0 0 0 0 r g b   (pixel 0)
        psllq   mm2, 24                 ; 0 0 R G B 0 0 0
        por     mm0, mm2                ; 0 0 R G B r g b

        movq    mm3, mm1                ; 0 R G B 0 r g b
        psllq   mm3, 48                 ; g b 0 0 0 0 0 0   (low bytes of pixel 2)
        por     mm0, mm3                ; g b R G B r g b   = output bytes 0..7

        movq    mm4, mm1                ; 0 R G B 0 r g b
        punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B   (pixel 3)
        punpckldq mm1, mm7              ; 0 0 0 0 0 r g b   (pixel 2)
        psrlq   mm1, 16                 ; 0 0 0 0 0 0 0 r
        psllq   mm4, 8                  ; 0 0 0 0 R G B 0
        por     mm1, mm4                ; 0 0 0 0 R G B r   = output bytes 8..11

        movq    [edi], mm0
        add     esi, BYTE 16
        movd    [edi+8], mm1
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0..3 pixels left over
        jz      .done
.single:                                ; copy three bytes, skip the fourth
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .single
.done:
        jmp     _mmxreturn
113 | |
114 | |
115 | |
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
; Converts 32-bit pixels to 16-bit pixels laid out as
; rrrrrggg gggbbbbb (source byte 2 on top, source byte 0 at the bottom).
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   leaves through the external _mmxreturn label
; Clobb: eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        ; channel masks stay resident for the whole run
        movq    mm5, [mmx32_rgb565_b]
        movq    mm6, [mmx32_rgb565_g]
        movq    mm7, [mmx32_rgb565_r]

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail                   ; no complete group

.quad:
        ; pixels 0 and 1
        movq    mm0, [esi]              ; argb
        movq    mm1, mm0                ; argb
        pand    mm0, mm6                ; 00g0
        movq    mm3, mm1                ; argb
        pand    mm1, mm5                ; 000b
        pand    mm3, mm7                ; 0r00  (consumed by packuswb below)
        pslld   mm1, 2                  ; 0 0 000000bb bbb00000
        por     mm0, mm1                ; 0 0 ggggggbb bbb00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggbbbbb

        ; pixels 2 and 3, same steps on a second register set
        movq    mm4, [esi+8]            ; argb
        movq    mm2, mm4                ; argb
        pand    mm4, mm6                ; 00g0
        movq    mm1, mm2                ; argb
        pand    mm2, mm5                ; 000b
        pand    mm1, mm7                ; 0r00
        pslld   mm2, 2                  ; 0 0 000000bb bbb00000
        por     mm4, mm2                ; 0 0 ggggggbb bbb00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggbbbbb

        packuswb mm3, mm1               ; each word lane = rrrrr000 00000000
        packssdw mm0, mm4               ; each word lane = 00000ggg gggbbbbb
        por     mm0, mm3                ; four finished pixels
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0..3 pixels left over
        jz      .done
.single:
        mov     al, [esi]               ; al = source byte 0
        mov     bh, [esi+1]             ; bh = source byte 1
        mov     ah, [esi+2]             ; ah = source byte 2
        shr     al, 3
        and     eax, 0F81Fh             ; top 5 bits of ah + low 5 of al (mask too wide for BYTE)
        shr     ebx, 5
        and     ebx, 07E0h              ; 6 bits of byte 1 in bits 10..5 (mask too wide for BYTE)
        add     eax, ebx
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .single

.done:
        jmp     _mmxreturn
181 | |
182 | |
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
; Converts 32-bit pixels to 16-bit pixels laid out as
; bbbbbggg gggrrrrr (source byte 0 on top, source byte 2 at the bottom).
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   leaves through the external _mmxreturn label
; Clobb: eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        ; channel masks stay resident for the whole run
        movq    mm5, [mmx32_rgb565_r]
        movq    mm6, [mmx32_rgb565_g]
        movq    mm7, [mmx32_rgb565_b]

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail                   ; no complete group

.quad:
        ; pixels 0 and 1
        movq    mm0, [esi]              ; a r g b
        movq    mm1, mm0                ; a r g b
        pand    mm0, mm6                ; 0 0 g 0
        movq    mm3, mm1                ; a r g b
        pand    mm1, mm5                ; 0 r 0 0
        pand    mm3, mm7                ; 0 0 0 b

        psllq   mm3, 16                 ; 0 b 0 0  (consumed by packuswb below)
        psrld   mm1, 14                 ; 0 0 000000rr rrr00000
        por     mm0, mm1                ; 0 0 ggggggrr rrr00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggrrrrr

        ; pixels 2 and 3, same steps on a second register set
        movq    mm4, [esi+8]            ; a r g b
        movq    mm2, mm4                ; a r g b
        pand    mm4, mm6                ; 0 0 g 0
        movq    mm1, mm2                ; a r g b
        pand    mm2, mm5                ; 0 r 0 0
        pand    mm1, mm7                ; 0 0 0 b

        psllq   mm1, 16                 ; 0 b 0 0
        psrld   mm2, 14                 ; 0 0 000000rr rrr00000
        por     mm4, mm2                ; 0 0 ggggggrr rrr00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggrrrrr

        packuswb mm3, mm1               ; each word lane = bbbbb000 00000000
        packssdw mm0, mm4               ; each word lane = 00000ggg gggrrrrr
        por     mm0, mm3                ; each word lane = bbbbbggg gggrrrrr
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad

.tail:
        and     edx, BYTE 3             ; edx itself counts the 0..3 leftover pixels
        jz      .done
.single:
        mov     al, [esi+2]             ; al = source byte 2
        mov     bh, [esi+1]             ; bh = source byte 1
        mov     ah, [esi]               ; ah = source byte 0
        shr     al, 3
        and     eax, 0F81Fh             ; top 5 bits of ah + low 5 of al (mask too wide for BYTE)
        shr     ebx, 5
        and     ebx, 07E0h              ; 6 bits of byte 1 in bits 10..5 (mask too wide for BYTE)
        add     eax, ebx
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .single

.done:
        jmp     _mmxreturn
250 | |
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
; Convert 32-bit pixels to 15-bit pixels stored in 16-bit words.
; Both entry points share one body; only the pmaddwd multiplier in mm7
; differs, and it decides which of source bytes 0 and 2 lands in bits
; 14..10 and which in bits 4..0.  Source byte 1 always supplies
; bits 9..5.
; In:    esi = source pixels, edi = destination, ecx = pixel count
; Out:   leaves through the external _mmxreturn label
; Clobb: eax, ecx, edx, esi, edi, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        ; the 16BGR555 converter is identical to the RGB555 one,
        ; except it uses a different multiplier for the pmaddwd
        ; instruction.  cool huh.

        movq    mm7, qword [mmx32_bgr555_mul]
        jmp     _convert_bgr555_cheat

; The main loop follows the Intel version.. they obviously went to
; much more trouble to expand/coil the loop than I did, so theirs
; would almost certainly be faster, even if only a little.
; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
; (I think) a more accurate name..
_ConvertMMXpII32_16RGB555:

        movq    mm7, qword [mmx32_rgb555_mul]
_convert_bgr555_cheat:
        movq    mm6, qword [mmx32_rgb555_g]     ; mm6 = byte-1 (green) mask

        mov     edx, ecx                ; keep the full count for the tail

        and     ecx, BYTE 0fffffff8h    ; ecx = count rounded down to a multiple of 8
        jnz     .prime
        jmp     near .tail              ; fewer than 8 pixels: no MMX loop

.prime:
        ; Software pipeline: load pixels 0..3 of the first group and start
        ; their byte-0/byte-2 multiply before entering the loop.
        movq    mm2, [esi+8]

        movq    mm0, [esi]
        movq    mm3, mm2

        pand    mm3, qword [mmx32_rgb555_rb]
        movq    mm1, mm0

        pand    mm1, qword [mmx32_rgb555_rb]
        pmaddwd mm3, mm7

        pmaddwd mm1, mm7
        pand    mm2, mm6

.octet:                                 ; 8 pixels: 32 bytes in, 16 bytes out
        movq    mm4, [esi+24]
        pand    mm0, mm6

        movq    mm5, [esi+16]
        por     mm3, mm2

        psrld   mm3, 6                  ; pixels 2,3 finished (one per dword)
        por     mm1, mm0

        movq    mm0, mm4
        psrld   mm1, 6                  ; pixels 0,1 finished (one per dword)

        pand    mm0, qword [mmx32_rgb555_rb]
        packssdw mm1, mm3               ; pixels 0..3 as four words

        movq    mm3, mm5
        pmaddwd mm0, mm7

        pand    mm3, qword [mmx32_rgb555_rb]
        pand    mm4, mm6

        movq    [edi], mm1
        pmaddwd mm3, mm7

        add     esi, BYTE 32
        por     mm4, mm0

        pand    mm5, mm6
        psrld   mm4, 6                  ; pixels 6,7 finished

        ; Preload the next group only when one exists.  On the last pass
        ; (ecx == 8) these loads would read up to 16 bytes beyond the
        ; pixels being converted; the stale register contents that get
        ; processed below instead are never stored.
        cmp     ecx, BYTE 8
        je      .no_preload
        movq    mm2, [esi+8]
        movq    mm0, [esi]
.no_preload:
        por     mm5, mm3
        psrld   mm5, 6                  ; pixels 4,5 finished

        movq    mm3, mm2
        movq    mm1, mm0

        pand    mm3, qword [mmx32_rgb555_rb]
        packssdw mm5, mm4               ; pixels 4..7 as four words

        pand    mm1, qword [mmx32_rgb555_rb]
        pand    mm2, mm6

        movq    [edi+8], mm5
        pmaddwd mm3, mm7

        pmaddwd mm1, mm7
        add     edi, BYTE 16

        sub     ecx, BYTE 8
        jz      .tail
        jmp     near .octet             ; loop body is too long for a short jump

.tail:
        mov     ecx, edx

        and     ecx, BYTE 7             ; 0..7 pixels left over
        jz      .done

.single:
        ; One pixel at a time through the same mask / pmaddwd / shift
        ; steps as the main loop, so the multiplier in mm7 selects the
        ; channel order for the leftover pixels as well.
        movd    mm0, [esi]              ; pixel in the low dword, upper dword zero
        add     esi, BYTE 4

        movq    mm1, mm0
        pand    mm1, qword [mmx32_rgb555_rb]    ; top 5 bits of bytes 0 and 2

        pand    mm0, mm6                ; top 5 bits of byte 1
        pmaddwd mm1, mm7                ; move bytes 0/2 to either side of byte 1

        por     mm0, mm1
        psrld   mm0, 6                  ; 15-bit pixel now in bits 14..0

        movd    eax, mm0
        mov     [edi], ax
        add     edi, BYTE 2

        dec     ecx
        jnz     .single

.done:
        jmp     _mmxreturn
385 | |
386 | |
387 |