comparison src/hermes/mmxp2_32.asm @ 0:74212992fb08

Initial revision
author Sam Lantinga <slouken@lokigames.com>
date Thu, 26 Apr 2001 16:45:43 +0000
parents
children 77b6110c797d
comparison
equal deleted inserted replaced
-1:000000000000 0:74212992fb08
1 ;
2 ; pII-optimised MMX format converters for HERMES
3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
4 ; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
5 ; This source code is licensed under the GNU LGPL
6 ;
7 ; Please refer to the file COPYING.LIB contained in the distribution for
8 ; licensing conditions
9 ;
10 ; COPYRIGHT NOTICE
11 ;
12 ; This file partly contains code that is (c) Intel Corporation, specifically
13 ; the mode detection routine, and the converter to 15 bit (8 pixel
14 ; conversion routine from the mmx programming tutorial pages).
15 ;
16 ;
17 ; These routines aren't exactly pII optimised - it's just that as they
18 ; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
19 ; optimise them for p5 MMXs..
20
21 BITS 32
22
23
24 GLOBAL _ConvertMMXpII32_24RGB888
25 GLOBAL _ConvertMMXpII32_16RGB565
26 GLOBAL _ConvertMMXpII32_16BGR565
27 GLOBAL _ConvertMMXpII32_16RGB555
28 GLOBAL _ConvertMMXpII32_16BGR555
29
30 EXTERN _mmxreturn
31
SECTION .data

ALIGN 8

;; ---------------------------------------------------------------------
;; 64-bit constants for the conversion routines.
;; Every constant holds the same 32-bit pattern twice, so that one MMX
;; operation treats the two pixels held in a quadword identically.
;; ---------------------------------------------------------------------

;; 32 -> 24 bit: keeps the low three bytes of each pixel, clears the top one
mmx32_rgb888_mask       times 2 dd 0x00ffffff

;; 32 -> 16 bit (5-6-5): one mask per colour component
mmx32_rgb565_b          times 2 dd 0x000000f8   ; top 5 bits of byte 0
mmx32_rgb565_g          times 2 dd 0x0000fc00   ; top 6 bits of byte 1
mmx32_rgb565_r          times 2 dd 0x00f80000   ; top 5 bits of byte 2

;; 32 -> 15 bit (5-5-5)
mmx32_rgb555_rb         times 2 dd 0x00f800f8   ; top 5 bits of bytes 0 and 2
mmx32_rgb555_g          times 2 dd 0x0000f800   ; top 5 bits of byte 1
;; pmaddwd multipliers: one 16-bit factor per 16-bit half of a pixel.
;; The two constants are the same factors with the halves exchanged.
mmx32_rgb555_mul        times 2 dd 0x20000008   ; low word * 0x0008, high word * 0x2000
mmx32_bgr555_mul        times 2 dd 0x00082000   ; low word * 0x2000, high word * 0x0008
49
50
51 SECTION .text
52
;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
; Packs 32-bit pixels into 24-bit pixels by dropping byte 3 of each one.
;
; In:    esi = source pixels (4 bytes each)
;        edi = destination pixels (3 bytes each)
;        ecx = number of pixels
; Out:   leaves through `jmp _mmxreturn` (external label)
; Clobb: al, bl, ecx, edx, esi, edi, mm0-mm4, mm6, mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        movq    mm6, qword [mmx32_rgb888_mask]  ; mm6 = keep-low-3-bytes mask
        pxor    mm7, mm7                        ; mm7 = 0 (for zero-extending dwords)

        mov     edx, ecx                ; edx = full pixel count, kept for the tail
        and     ecx, 0fffffffch         ; ecx = count rounded down to a multiple of 4
        jz      NEAR .tail              ; fewer than 4 pixels: scalar path only

        ; ---- 4 pixels (16 source bytes -> 12 destination bytes) per pass ----
.quad:
        movq    mm0, [esi]              ; pixel 0 (low dword), pixel 1 (high dword)
        movq    mm1, [esi+8]            ; pixel 2 (low dword), pixel 3 (high dword)
        pand    mm0, mm6                ; clear byte 3 of both pixels
        pand    mm1, mm6

        ; output bytes 0-5: pixel 0 followed by pixel 1
        movq    mm2, mm0
        punpckldq mm0, mm7              ; mm0 = pixel 0, zero-extended to 64 bits
        punpckhdq mm2, mm7              ; mm2 = pixel 1, zero-extended to 64 bits
        psllq   mm2, 24                 ; move pixel 1 up to bytes 3-5
        por     mm0, mm2

        ; output bytes 6-7: the two low bytes of pixel 2
        movq    mm3, mm1
        psllq   mm3, 48
        por     mm0, mm3

        ; output bytes 8-11: third byte of pixel 2, then pixel 3
        movq    mm4, mm1
        punpckldq mm1, mm7              ; mm1 = pixel 2, zero-extended
        punpckhdq mm4, mm7              ; mm4 = pixel 3, zero-extended
        psrlq   mm1, 16                 ; third byte of pixel 2 -> byte 0
        psllq   mm4, 8                  ; pixel 3 -> bytes 1-3
        por     mm1, mm4

        movq    [edi], mm0              ; first 8 output bytes
        movd    [edi+8], mm1            ; last 4 output bytes
        add     esi, BYTE 16
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad

        ; ---- remaining 0..3 pixels, one at a time ----
.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; ecx = count mod 4
        jz      .done
.single:
        mov     al, [esi]               ; copy bytes 0..2, skip byte 3
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .single
.done:
        jmp     _mmxreturn
112
113
114
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
; Converts 32-bit pixels (byte 0 = blue, 1 = green, 2 = red) into 16-bit
; pixels laid out as rrrrrggg gggbbbbb.
;
; In:    esi = source pixels (4 bytes each)
;        edi = destination pixels (2 bytes each)
;        ecx = number of pixels
; Out:   leaves through `jmp _mmxreturn` (external label)
; Clobb: eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        movq    mm5, [mmx32_rgb565_b]   ; mm5 = blue mask
        movq    mm6, [mmx32_rgb565_g]   ; mm6 = green mask
        movq    mm7, [mmx32_rgb565_r]   ; mm7 = red mask

        mov     edx, ecx                ; edx = full pixel count, kept for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      NEAR .tail

        ; ---- 4 pixels (16 source bytes -> 8 destination bytes) per pass ----
.quad:
        ; pixels 0 and 1
        movq    mm0, [esi]
        movq    mm1, mm0
        movq    mm3, mm0
        pand    mm0, mm6                ; green field, bits 10-15 of each dword
        pand    mm1, mm5                ; blue field,  bits 3-7
        pand    mm3, mm7                ; red field,   bits 19-23 (merged after packing)
        pslld   mm1, 2                  ; blue  -> bits 5-9
        por     mm0, mm1                ; ggggggbb bbb00000
        psrld   mm0, 5                  ; 00000ggg gggbbbbb

        ; pixels 2 and 3, same steps
        movq    mm4, [esi+8]
        movq    mm2, mm4
        movq    mm1, mm4
        pand    mm4, mm6
        pand    mm2, mm5
        pand    mm1, mm7
        pslld   mm2, 2
        por     mm4, mm2
        psrld   mm4, 5

        ; Each masked red dword is the word pair (0000h, 00RRh); packing the
        ; words to bytes yields the 16-bit value RR00h = rrrrr000 00000000.
        packuswb mm3, mm1               ; 4 x red, already in final position
        packssdw mm0, mm4               ; 4 x green+blue, narrowed to 16 bits
        por     mm0, mm3                ; 4 finished pixels
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quad

        ; ---- remaining 0..3 pixels, one at a time ----
.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; ecx = count mod 4
        jz      .done
.single:
        mov     al, [esi]               ; al = blue
        mov     bh, [esi+1]             ; bh = green
        mov     ah, [esi+2]             ; ah = red
        shr     al, 3                   ; blue -> bits 0-4
        and     eax, 0F81Fh             ; rrrrr000 000bbbbb, stale upper bits cleared
        shr     ebx, 5                  ; green -> bits 3-10
        and     ebx, 07E0h              ; keep its top 6 bits (bits 5-10), drop stale bits
        or      eax, ebx                ; the two fields do not overlap
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .single

.done:
        jmp     _mmxreturn
180
181
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
; Converts 32-bit pixels (byte 0 = blue, 1 = green, 2 = red) into 16-bit
; pixels laid out as bbbbbggg gggrrrrr.
;
; In:    esi = source pixels (4 bytes each)
;        edi = destination pixels (2 bytes each)
;        ecx = number of pixels
; Out:   leaves through `jmp _mmxreturn` (external label)
; Clobb: eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        movq    mm5, [mmx32_rgb565_r]   ; mm5 = red mask
        movq    mm6, [mmx32_rgb565_g]   ; mm6 = green mask
        movq    mm7, [mmx32_rgb565_b]   ; mm7 = blue mask

        mov     edx, ecx                ; edx = full pixel count, kept for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      NEAR .tail

        ; ---- 4 pixels (16 source bytes -> 8 destination bytes) per pass ----
.quad:
        ; pixels 0 and 1
        movq    mm0, [esi]
        movq    mm1, mm0
        movq    mm3, mm0
        pand    mm0, mm6                ; green field, bits 10-15 of each dword
        pand    mm1, mm5                ; red field,   bits 19-23
        pand    mm3, mm7                ; blue field,  bits 3-7

        psllq   mm3, 16                 ; blue -> bits 19-23 (merged after packing)
        psrld   mm1, 14                 ; red  -> bits 5-9
        por     mm0, mm1                ; ggggggrr rrr00000
        psrld   mm0, 5                  ; 00000ggg gggrrrrr

        ; pixels 2 and 3, same steps
        movq    mm4, [esi+8]
        movq    mm2, mm4
        movq    mm1, mm4
        pand    mm4, mm6
        pand    mm2, mm5
        pand    mm1, mm7

        psllq   mm1, 16
        psrld   mm2, 14
        por     mm4, mm2
        psrld   mm4, 5

        ; Each shifted blue dword is the word pair (0000h, 00BBh); packing the
        ; words to bytes yields the 16-bit value BB00h = bbbbb000 00000000.
        packuswb mm3, mm1               ; 4 x blue, already in final position
        packssdw mm0, mm4               ; 4 x green+red, narrowed to 16 bits
        por     mm0, mm3                ; 4 finished pixels
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad

        ; ---- remaining 0..3 pixels; edx itself is the loop counter ----
.tail:
        and     edx, BYTE 3             ; edx = count mod 4
        jz      .done
.single:
        mov     al, [esi+2]             ; al = red
        mov     bh, [esi+1]             ; bh = green
        mov     ah, [esi]               ; ah = blue
        shr     al, 3                   ; red -> bits 0-4
        and     eax, 0F81Fh             ; bbbbb000 000rrrrr, stale upper bits cleared
        shr     ebx, 5                  ; green -> bits 3-10
        and     ebx, 07E0h              ; keep its top 6 bits (bits 5-10), drop stale bits
        or      eax, ebx                ; the two fields do not overlap
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .single

.done:
        jmp     _mmxreturn
249
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
; Convert 32-bit pixels (byte 0 = blue, 1 = green, 2 = red) into 15-bit
; pixels:  RGB555 = 0rrrrrgg gggbbbbb,  BGR555 = 0bbbbbgg gggrrrrr.
;
; In:    esi = source pixels (4 bytes each)
;        edi = destination pixels (2 bytes each)
;        ecx = number of pixels
; Out:   leaves through `jmp _mmxreturn` (external label)
; Clobb: eax, ecx, edx, esi, edi, mm0-mm3, mm5-mm7, flags
;
; How a pixel is converted:
;   * AND with mmx32_rgb555_rb leaves the top 5 bits of blue in the low
;     word and the top 5 bits of red in the high word of the pixel.
;   * pmaddwd multiplies each word by its own factor and adds the two
;     products, i.e. it shifts blue and red by different amounts in one
;     instruction.  mmx32_rgb555_mul puts blue at bits 6-10 and red at
;     bits 16-20; mmx32_bgr555_mul exchanges the two positions.
;   * The masked green (bits 11-15) is OR-ed in and the dword is shifted
;     right by 6, which gives the final 15-bit value.
;
; Both entry points share all code after the multiplier has been loaded.
; The remainder pixels go through the same multiplier as the main loop,
; so they come out in the layout selected at entry (a fixed scalar
; RGB555 sequence here would swap red and blue for the BGR555 entry).
; Each loop pass loads only the 32 bytes it converts, so nothing beyond
; the ecx requested pixels is ever read.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        ; the 16BGR555 converter is identical to the RGB555 one,
        ; except it uses a different multiplier for the pmaddwd
        ; instruction.  cool huh.

        movq    mm7, qword [mmx32_bgr555_mul]
        jmp     _convert_bgr555_cheat

; 'mmx32_rgb555_mul' was called 'mmx32_rgb555_add' in the Intel code this
; routine is derived from; 'mul' describes its use with pmaddwd better.
_ConvertMMXpII32_16RGB555:

        movq    mm7, qword [mmx32_rgb555_mul]

_convert_bgr555_cheat:
        movq    mm6, qword [mmx32_rgb555_g]     ; mm6 = green mask
        movq    mm5, qword [mmx32_rgb555_rb]    ; mm5 = red/blue mask
                                                ; mm7 = multiplier chosen at entry

        mov     edx, ecx                ; edx = full pixel count, kept for the tail
        and     ecx, BYTE -8            ; ecx = count rounded down to a multiple of 8
        jz      NEAR .tail              ; fewer than 8 pixels: remainder path only

        ; ---- 8 pixels (32 source bytes -> 16 destination bytes) per pass ----
.block8:
        ; pixels 0-3
        movq    mm0, [esi]              ; pixels 0,1
        movq    mm2, [esi+8]            ; pixels 2,3
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5                ; red/blue fields
        pand    mm3, mm5
        pmaddwd mm1, mm7                ; move red and blue to their target bits (+6)
        pmaddwd mm3, mm7
        pand    mm0, mm6                ; green fields
        pand    mm2, mm6
        por     mm0, mm1
        por     mm2, mm3
        psrld   mm0, 6                  ; final 15-bit values, one per dword
        psrld   mm2, 6
        packssdw mm0, mm2               ; narrow to 4 x 16 bits
        movq    [edi], mm0

        ; pixels 4-7
        movq    mm0, [esi+16]           ; pixels 4,5
        movq    mm2, [esi+24]           ; pixels 6,7
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5
        pand    mm3, mm5
        pmaddwd mm1, mm7
        pmaddwd mm3, mm7
        pand    mm0, mm6
        pand    mm2, mm6
        por     mm0, mm1
        por     mm2, mm3
        psrld   mm0, 6
        psrld   mm2, 6
        packssdw mm0, mm2
        movq    [edi+8], mm0

        add     esi, BYTE 32
        add     edi, BYTE 16
        sub     ecx, BYTE 8
        jnz     NEAR .block8            ; NEAR: the loop body is close to the short-jump limit

        ; ---- remaining 0..7 pixels, one at a time, same arithmetic ----
.tail:
        and     edx, BYTE 7             ; edx = count mod 8
        jz      .done

.single:
        movd    mm0, [esi]              ; one pixel, upper half of mm0 is zero
        movq    mm1, mm0
        pand    mm1, mm5                ; red/blue fields
        pmaddwd mm1, mm7                ; layout follows the entry point's multiplier
        pand    mm0, mm6                ; green field
        por     mm0, mm1
        psrld   mm0, 6
        movd    eax, mm0
        mov     [edi], ax               ; bit 15 of the result is always 0

        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .single

.done:
        jmp     _mmxreturn
384
385
386