Mercurial > sdl-ios-xcode
annotate src/hermes/mmxp2_32.asm @ 1166:da33b7e6d181
Date: Tue, 1 Nov 2005 20:25:10 +0100
From: Dirk Mueller
Subject: [PATCH] build SDL with nonexecutable stack
libSDL is by default marked as requiring an executable stack, which it doesn't
actually need. The reason for this is that there are assembler files in the
source tree that are not properly annotated with the "noexec stack" section. As
such, the linker falls back to the safe choice and marks the whole library as
"requires executable stack".
The patch below removes this by adding the annotations. As far as I can see it
shouldn't break anything.
author | Sam Lantinga <slouken@libsdl.org> |
---|---|
date | Tue, 01 Nov 2005 23:19:59 +0000 |
parents | 77b6110c797d |
children | 2d6dc7de1145 |
rev | line source |
---|---|
0 | 1 ; |
2 ; pII-optimised MMX format converters for HERMES | |
3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk) | |
4 ; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au) | |
5 ; This source code is licensed under the GNU LGPL | |
6 ; | |
7 ; Please refer to the file COPYING.LIB contained in the distribution for | |
8 ; licensing conditions | |
9 ; | |
10 ; COPYRIGHT NOTICE | |
11 ; | |
12 ; This file partly contains code that is (c) Intel Corporation, specifically | |
13 ; the mode detection routine, and the converter to 15 bit (8 pixel | |
14 ; conversion routine from the mmx programming tutorial pages). | |
15 ; | |
16 ; | |
17 ; These routines aren't exactly pII optimised - it's just that as they | |
18 ; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to | |
19 ; optimise them for p5 MMXs.. | |
20 | |
21 BITS 32 | |
22 | |
23 | |
24 GLOBAL _ConvertMMXpII32_24RGB888 | |
25 GLOBAL _ConvertMMXpII32_16RGB565 | |
26 GLOBAL _ConvertMMXpII32_16BGR565 | |
27 GLOBAL _ConvertMMXpII32_16RGB555 | |
28 GLOBAL _ConvertMMXpII32_16BGR555 | |
29 | |
30 EXTERN _mmxreturn | |
31 | |
1166
da33b7e6d181
Date: Tue, 1 Nov 2005 20:25:10 +0100
Sam Lantinga <slouken@libsdl.org>
parents:
289
diff
changeset
|
; Mark the object as not needing an executable stack.  The
; .note.GNU-stack section and its attributes are an ELF convention, so
; only emit it when assembling for ELF; other output formats do not
; define these section attributes.  Both spellings of the 32-bit ELF
; format name are checked because the value of __OUTPUT_FORMAT__ for
; "-f elf" differs between NASM versions.
%ifidn __OUTPUT_FORMAT__,elf
SECTION .note.GNU-stack noalloc progbits noexec nowrite
%elifidn __OUTPUT_FORMAT__,elf32
SECTION .note.GNU-stack noalloc progbits noexec nowrite
%endif

SECTION .data

ALIGN 8

;; Constants for conversion routines.
;; Every constant is one quadword: the same dword mask repeated twice,
;; so a single MMX operation handles two 32-bit pixels.

; Clears the top byte of each 32-bit pixel.
mmx32_rgb888_mask dd 00ffffffh,00ffffffh

; Top 5 bits of the low byte, top 6 bits of the second byte and top
; 5 bits of the third byte of each pixel (the 5-6-5 channel bits).
mmx32_rgb565_b dd 000000f8h, 000000f8h
mmx32_rgb565_g dd 0000fc00h, 0000fc00h
mmx32_rgb565_r dd 00f80000h, 00f80000h

; 5-5-5 conversion: mask for the first and third bytes, mask for the
; second byte, and the pmaddwd multipliers that move the two masked
; channels into place.  The rgb and bgr multipliers are the same pair
; of word factors (0008h and 2000h) in opposite order, which swaps
; the positions of the two outer channels.
mmx32_rgb555_rb dd 00f800f8h,00f800f8h
mmx32_rgb555_g dd 0000f800h,0000f800h
mmx32_rgb555_mul dd 20000008h,20000008h
mmx32_bgr555_mul dd 00082000h,00082000h
49 | |
50 | |
51 | |
52 SECTION .text | |
53 | |
;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
; Packs 32-bit pixels into 24-bit pixels by dropping the top byte of
; every source pixel.
; In:    esi = source pixels (4 bytes each)
;        edi = destination   (3 bytes each)
;        ecx = number of pixels
; Out:   esi / edi advanced past the converted pixels; control leaves
;        through "jmp _mmxreturn" (external, not a ret).
; Uses:  al, bl, dl, ecx, edx, mm0-mm4, mm6, mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        movq    mm6, qword [mmx32_rgb888_mask]  ; mm6 = mask that clears each pixel's top byte
        pxor    mm7, mm7                        ; mm7 = 0, used to zero-extend dwords

        mov     edx, ecx                        ; keep the full count for the tail
        and     ecx, 0fffffffch                 ; ecx = count rounded down to a multiple of 4
        jz      .tail                           ; fewer than 4 pixels: scalar path only

        ; Four pixels (16 bytes) in, 12 bytes out per iteration.
        ; Byte diagrams read most-significant byte first; upper-case
        ; letters are the pixel in the high dword of a register.
.quad_loop:
        movq    mm0, [esi]                      ; A R G B a r g b   (pixels 1,0)
        movq    mm1, [esi+8]                    ; A R G B a r g b   (pixels 3,2)
        pand    mm0, mm6                        ; 0 R G B 0 r g b
        pand    mm1, mm6                        ; 0 R G B 0 r g b

        movq    mm3, mm1                        ; copy of pixels 3,2
        movq    mm4, mm1                        ; second copy of pixels 3,2
        movq    mm2, mm0                        ; copy of pixels 1,0

        psllq   mm3, 48                         ; g b 0 0 0 0 0 0   (pixel 2, low two bytes)
        punpckhdq mm2, mm7                      ; 0 0 0 0 0 R G B   (pixel 1)
        punpckldq mm0, mm7                      ; 0 0 0 0 0 r g b   (pixel 0)
        psllq   mm2, 24                         ; 0 0 R G B 0 0 0
        por     mm0, mm2                        ; 0 0 R G B r g b
        por     mm0, mm3                        ; g b R G B r g b   = output bytes 0..7

        punpckhdq mm4, mm7                      ; 0 0 0 0 0 R G B   (pixel 3)
        punpckldq mm1, mm7                      ; 0 0 0 0 0 r g b   (pixel 2)
        psllq   mm4, 8                          ; 0 0 0 0 R G B 0
        psrlq   mm1, 16                         ; 0 0 0 0 0 0 0 r   (pixel 2, third byte)
        por     mm1, mm4                        ; 0 0 0 0 R G B r   = output bytes 8..11

        movq    [edi], mm0
        movd    [edi+8], mm1
        add     esi, BYTE 16
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad_loop

        ; Remaining count & 3 pixels, copied one byte at a time.
.tail:
        mov     ecx, edx
        and     ecx, BYTE 3
        jz      .done

.pixel_loop:
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]                     ; edx is free again: the count now lives in ecx
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .pixel_loop

.done:
        jmp     _mmxreturn
113 | |
114 | |
115 | |
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
; Converts 32-bit pixels to 16-bit 5-6-5 pixels: the top 5 bits of
; source byte 2 land in bits 15..11, the top 6 bits of byte 1 in bits
; 10..5 and the top 5 bits of byte 0 in bits 4..0.
; In:    esi = source pixels (4 bytes each)
;        edi = destination   (2 bytes each)
;        ecx = number of pixels
; Out:   esi / edi advanced past the converted pixels; control leaves
;        through "jmp _mmxreturn" (external, not a ret).
; Uses:  eax, ebx, ecx, edx, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        ; Channel masks, two pixels per register.
        movq    mm5, [mmx32_rgb565_b]
        movq    mm6, [mmx32_rgb565_g]
        movq    mm7, [mmx32_rgb565_r]

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

        ; Four pixels (16 bytes) in, 8 bytes out per iteration.
.quad_loop:
        movq    mm0, [esi]              ; pixels 0,1
        movq    mm4, [esi+8]            ; pixels 2,3
        movq    mm1, mm0
        movq    mm3, mm0
        movq    mm2, mm4

        ; pixels 0,1: green and blue assembled in the low word
        pand    mm1, mm5                ; 000b
        pand    mm0, mm6                ; 00g0
        pand    mm3, mm7                ; 0r00
        pslld   mm1, 2                  ; 0 0 000000bb bbb00000
        por     mm0, mm1                ; 0 0 ggggggbb bbb00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggbbbbb

        ; pixels 2,3: same again (mm1 is free to be reused for red)
        movq    mm1, mm4
        pand    mm2, mm5                ; 000b
        pand    mm4, mm6                ; 00g0
        pand    mm1, mm7                ; 0r00
        pslld   mm2, 2                  ; 0 0 000000bb bbb00000
        por     mm4, mm2                ; 0 0 ggggggbb bbb00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggbbbbb

        ; Red sits in byte 2 of each dword.  Packing words to bytes
        ; turns every dword into the byte pair (00, rrrrr000), i.e.
        ; red ends up in the top 5 bits of a 16-bit lane.
        packuswb mm3, mm1               ; four words: rrrrr000 00000000
        packssdw mm0, mm4               ; four words: 00000ggg gggbbbbb
        por     mm0, mm3                ; four words: rrrrrggg gggbbbbb
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad_loop

        ; Remaining count & 3 pixels, one at a time.
.tail:
        mov     ecx, edx
        and     ecx, BYTE 3
        jz      .done

.pixel_loop:
        mov     al, [esi]               ; byte 0 -> al
        mov     bh, [esi+1]             ; byte 1 -> bits 15..8 of ebx
        mov     ah, [esi+2]             ; byte 2 -> ah
        shr     al, 3                   ; low channel down to 5 bits
        and     eax, 0F81Fh             ; keep bits 15..11 and 4..0
        shr     ebx, 5                  ; top 6 bits of byte 1 -> bits 10..5
        and     ebx, 07E0h              ; everything else in ebx was never initialised; drop it
        or      eax, ebx                ; the two fields do not overlap
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .pixel_loop

.done:
        jmp     _mmxreturn
181 | |
182 | |
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
; Converts 32-bit pixels to 16-bit 5-6-5 pixels with the outer channels
; swapped: the top 5 bits of source byte 0 land in bits 15..11, the
; top 6 bits of byte 1 in bits 10..5 and the top 5 bits of byte 2 in
; bits 4..0.
; In:    esi = source pixels (4 bytes each)
;        edi = destination   (2 bytes each)
;        ecx = number of pixels
; Out:   esi / edi advanced past the converted pixels; control leaves
;        through "jmp _mmxreturn" (external, not a ret).
; Uses:  eax, ebx, ecx, edx, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        ; Same masks as the RGB565 routine, with r and b in swapped
        ; registers.
        movq    mm5, [mmx32_rgb565_r]
        movq    mm6, [mmx32_rgb565_g]
        movq    mm7, [mmx32_rgb565_b]

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

        ; Four pixels (16 bytes) in, 8 bytes out per iteration.
.quad_loop:
        movq    mm0, [esi]              ; a r g b   (pixels 0,1)
        movq    mm4, [esi+8]            ; a r g b   (pixels 2,3)
        movq    mm1, mm0
        movq    mm3, mm0
        movq    mm2, mm4

        ; pixels 0,1
        pand    mm1, mm5                ; 0 r 0 0
        pand    mm0, mm6                ; 0 0 g 0
        pand    mm3, mm7                ; 0 0 0 b
        psrld   mm1, 14                 ; 0 0 000000rr rrr00000
        psllq   mm3, 16                 ; 0 b 0 0  (64-bit shift; the masked dwords have
                                        ;           clear upper bits, so nothing crosses over)
        por     mm0, mm1                ; 0 0 ggggggrr rrr00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggrrrrr

        ; pixels 2,3 (mm1 is free to be reused for blue)
        movq    mm1, mm4
        pand    mm2, mm5                ; 0 r 0 0
        pand    mm4, mm6                ; 0 0 g 0
        pand    mm1, mm7                ; 0 0 0 b
        psrld   mm2, 14                 ; 0 0 000000rr rrr00000
        psllq   mm1, 16                 ; 0 b 0 0
        por     mm4, mm2                ; 0 0 ggggggrr rrr00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggrrrrr

        packuswb mm3, mm1               ; four words: bbbbb000 00000000
        packssdw mm0, mm4               ; four words: 00000ggg gggrrrrr
        por     mm0, mm3                ; four words: bbbbbggg gggrrrrr
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad_loop

        ; Remaining count & 3 pixels, one at a time.  edx itself is the
        ; loop counter here.
.tail:
        and     edx, BYTE 3
        jz      .done

.pixel_loop:
        mov     al, [esi+2]             ; byte 2 -> al
        mov     bh, [esi+1]             ; byte 1 -> bits 15..8 of ebx
        mov     ah, [esi]               ; byte 0 -> ah
        shr     al, 3                   ; low channel down to 5 bits
        and     eax, 0F81Fh             ; keep bits 15..11 and 4..0
        shr     ebx, 5                  ; top 6 bits of byte 1 -> bits 10..5
        and     ebx, 07E0h              ; everything else in ebx was never initialised; drop it
        or      eax, ebx                ; the two fields do not overlap
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .pixel_loop

.done:
        jmp     _mmxreturn
250 | |
;-----------------------------------------------------------------------
; MMX32_555_CONVERT4 src_offset, dst_offset
; Converts the four 32-bit pixels at [esi + src_offset] into four
; 15-bit pixels stored at [edi + dst_offset].
; Expects:  mm5 = mmx32_rgb555_rb, mm6 = mmx32_rgb555_g,
;           mm7 = channel multiplier (mmx32_rgb555_mul or
;                 mmx32_bgr555_mul)
; Clobbers: mm0-mm3
;-----------------------------------------------------------------------
%macro MMX32_555_CONVERT4 2
        movq    mm0, [esi + %1]         ; pixels 0,1
        movq    mm2, [esi + %1 + 8]     ; pixels 2,3
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5                ; top 5 bits of bytes 0 and 2, one per word
        pand    mm3, mm5
        pand    mm0, mm6                ; top 5 bits of byte 1, at bits 15..11
        pand    mm2, mm6
        pmaddwd mm1, mm7                ; word*8 -> bits 10..6, word*2000h -> bits 20..16
        pmaddwd mm3, mm7
        por     mm1, mm0                ; 15 significant bits at 20..6
        por     mm3, mm2
        psrld   mm1, 6                  ; ... moved down to bits 14..0
        psrld   mm3, 6
        packssdw mm1, mm3               ; four dwords -> four words
        movq    [edi + %2], mm1
%endmacro

;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
; Convert 32-bit pixels to 15-bit 5-5-5 pixels.  The two entry points
; share all code and differ only in the pmaddwd multiplier held in
; mm7, which decides which of the two outer channels ends up in bits
; 14..10 and which in bits 4..0.
; In:    esi = source pixels (4 bytes each)
;        edi = destination   (2 bytes each)
;        ecx = number of pixels
; Out:   esi / edi advanced past the converted pixels; control leaves
;        through "jmp _mmxreturn" (external, not a ret).
; Uses:  eax, ecx, edx, mm0-mm3, mm5-mm7, flags
;
; Derived from the Intel 8-pixel converter ('mmx32_rgb555_add' was
; renamed 'mmx32_rgb555_mul', which is the more accurate name).  Two
; changes relative to that version:
;   * The main loop no longer preloads the first half of the next
;     8-pixel block.  That look-ahead read 16 bytes beyond the pixels
;     actually converted on the final iteration.
;   * The leftover pixels are converted with the same mask/pmaddwd
;     sequence as the main loop.  The previous scalar tail hard-coded
;     the RGB layout, so a BGR555 conversion produced swapped outer
;     channels for the last (count & 7) pixels.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        movq    mm7, qword [mmx32_bgr555_mul]
        jmp     _convert_bgr555_cheat

_ConvertMMXpII32_16RGB555:

        movq    mm7, qword [mmx32_rgb555_mul]

_convert_bgr555_cheat:
        movq    mm6, qword [mmx32_rgb555_g]
        movq    mm5, qword [mmx32_rgb555_rb]

        mov     edx, ecx                ; keep the full count for the tail
        and     ecx, BYTE -8            ; ecx = count rounded down to a multiple of 8
        ; "near" on purpose: an earlier revision of this routine hit
        ; "short jump out of range" here, so do not depend on the
        ; assembler sizing the jump automatically.
        jz      near .tail

        ; Eight pixels (32 bytes) in, 16 bytes out per iteration.
.block_loop:
        MMX32_555_CONVERT4 0, 0
        MMX32_555_CONVERT4 16, 8
        add     esi, BYTE 32
        add     edi, BYTE 16
        sub     ecx, BYTE 8
        jnz     near .block_loop

        ; Remaining count & 7 pixels, one at a time, honouring mm7 so
        ; both entry points get their own channel order.
.tail:
        mov     ecx, edx
        and     ecx, BYTE 7
        jz      .done

.pixel_loop:
        movd    mm0, [esi]              ; one pixel, upper dword zeroed
        add     esi, BYTE 4
        movq    mm1, mm0
        pand    mm0, mm5                ; top 5 bits of bytes 0 and 2
        pand    mm1, mm6                ; top 5 bits of byte 1
        pmaddwd mm0, mm7                ; place the outer channels per the multiplier
        por     mm0, mm1
        psrld   mm0, 6                  ; 15-bit pixel in bits 14..0
        movd    eax, mm0
        mov     [edi], ax
        add     edi, BYTE 2
        dec     ecx
        jnz     .pixel_loop

.done:
        jmp     _mmxreturn
385 | |
386 | |
387 |