0
|
1 ;
|
|
2 ; pII-optimised MMX format converters for HERMES
|
|
3 ; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
|
|
4 ; and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
|
|
5 ; This source code is licensed under the GNU LGPL
|
|
6 ;
|
|
7 ; Please refer to the file COPYING.LIB contained in the distribution for
|
|
8 ; licensing conditions
|
|
9 ;
|
|
10 ; COPYRIGHT NOTICE
|
|
11 ;
|
|
12 ; This file partly contains code that is (c) Intel Corporation, specifically
|
|
13 ; the mode detection routine, and the converter to 15 bit (8 pixel
|
|
14 ; conversion routine from the mmx programming tutorial pages).
|
|
15 ;
|
|
16 ;
|
|
17 ; These routines aren't exactly pII optimised - it's just that as they
|
|
18 ; are, they're terrible on p5 MMXs, but less so on pIIs. Someone needs to
|
|
19 ; optimise them for p5 MMXs..
|
|
20
|
|
21 BITS 32
|
|
22
|
|
23
|
|
24 GLOBAL _ConvertMMXpII32_24RGB888
|
|
25 GLOBAL _ConvertMMXpII32_16RGB565
|
|
26 GLOBAL _ConvertMMXpII32_16BGR565
|
|
27 GLOBAL _ConvertMMXpII32_16RGB555
|
|
28 GLOBAL _ConvertMMXpII32_16BGR555
|
|
29
|
|
30 EXTERN _mmxreturn
|
|
31
|
|
SECTION .data

ALIGN 8

;; Constants for the conversion routines.
;; Every constant is one quadword made of two identical dwords, so that a
;; single MMX operation applies the same mask/multiplier to two 32-bit
;; pixels at once.

; Keeps the low three bytes of each 32-bit pixel and clears the top byte.
mmx32_rgb888_mask:  times 2 dd 0x00ffffff

; Per-channel masks for the 565 converters:
; top 5 bits of byte 0, top 6 bits of byte 1, top 5 bits of byte 2.
mmx32_rgb565_b:     times 2 dd 0x000000f8
mmx32_rgb565_g:     times 2 dd 0x0000fc00
mmx32_rgb565_r:     times 2 dd 0x00f80000

; Masks and pmaddwd multipliers for the 555 converters.
; _rb keeps the top 5 bits of byte 0 and byte 2, _g the top 5 bits of byte 1.
; The *_mul words are (low word, high word) multipliers: 0008h moves a
; masked channel up by 3 bits, 2000h moves it up by 13 bits.
mmx32_rgb555_rb:    times 2 dd 0x00f800f8
mmx32_rgb555_g:     times 2 dd 0x0000f800
mmx32_rgb555_mul:   times 2 dd 0x20000008
mmx32_bgr555_mul:   times 2 dd 0x00082000
|
|
48
|
|
49
|
|
50
|
|
51 SECTION .text
|
|
52
|
|
;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
; Packs 32-bit pixels into 24-bit pixels by dropping the top byte of
; every source dword.
;
; In:    esi = source pixels (4 bytes each)
;        edi = destination pixels (3 bytes each)
;        ecx = number of pixels to convert
; Uses:  al, bl, ecx, edx, esi, edi, mm0-mm4, mm6, mm7, flags
; Exit:  jumps to _mmxreturn (defined outside this file)
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        movq    mm6, qword [mmx32_rgb888_mask]  ; mm6 = 00ffffff 00ffffff
        pxor    mm7, mm7                        ; mm7 = 0, used for unpacking

        mov     edx, ecx                        ; edx = full pixel count
        and     ecx, 0fffffffch                 ; ecx = count rounded down to a multiple of 4
        jz      .tail_setup                     ; fewer than 4 pixels: scalar path only

        ; Main loop: 4 source pixels (16 bytes) -> 12 destination bytes.
        ; Byte diagrams list the most significant byte first.
.quad_loop:
        movq    mm0, [esi]              ; A R G B a r g b   (pixels 1 and 0)
        movq    mm1, [esi+8]            ; A R G B a r g b   (pixels 3 and 2)
        pand    mm0, mm6                ; 0 R G B 0 r g b
        pand    mm1, mm6                ; 0 R G B 0 r g b

        ; mm0 <- pixel 0, pixel 1 and the two low bytes of pixel 2
        movq    mm2, mm0
        punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B   (pixel 1 alone)
        punpckldq mm0, mm7              ; 0 0 0 0 0 r g b   (pixel 0 alone)
        psllq   mm2, 24                 ; 0 0 R G B 0 0 0
        por     mm0, mm2                ; 0 0 R G B r g b

        movq    mm3, mm1
        psllq   mm3, 48                 ; g b 0 0 0 0 0 0   (low bytes of pixel 2)
        por     mm0, mm3                ; g b R G B r g b

        ; low dword of mm1 <- high byte of pixel 2 followed by pixel 3
        movq    mm4, mm1
        punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B   (pixel 3 alone)
        punpckldq mm1, mm7              ; 0 0 0 0 0 r g b   (pixel 2 alone)
        psllq   mm4, 8                  ; 0 0 0 0 R G B 0
        psrlq   mm1, 16                 ; 0 0 0 0 0 0 0 r
        por     mm1, mm4                ; 0 0 0 0 R G B r

        movq    [edi], mm0              ; destination bytes 0..7
        movd    [edi+8], mm1            ; destination bytes 8..11
        add     esi, BYTE 16
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad_loop

        ; Scalar path for the remaining (count & 3) pixels.
.tail_setup:
        mov     ecx, edx
        and     ecx, BYTE 3
        jz      .done

.tail_loop:
        mov     al, [esi]               ; copy the three low bytes, skip the 4th
        mov     bl, [esi+1]
        mov     dl, [esi+2]             ; edx is no longer needed as the saved count
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .tail_loop

.done:
        jmp     _mmxreturn
|
|
112
|
|
113
|
|
114
|
|
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
; Converts 32-bit pixels (byte 0 = blue, byte 1 = green, byte 2 = red)
; to 16-bit pixels laid out as rrrrrggg gggbbbbb.
;
; In:    esi = source pixels (4 bytes each)
;        edi = destination pixels (2 bytes each)
;        ecx = number of pixels to convert
; Uses:  eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
; Exit:  jumps to _mmxreturn (defined outside this file)
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        movq    mm5, [mmx32_rgb565_b]   ; mm5 = blue mask
        movq    mm6, [mmx32_rgb565_g]   ; mm6 = green mask
        movq    mm7, [mmx32_rgb565_r]   ; mm7 = red mask

        mov     edx, ecx                ; edx = full pixel count
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail_setup

        ; Main loop: 4 source pixels (16 bytes) -> 4 destination pixels (8 bytes).
.quad_loop:
        ; pixels 0 and 1: green+blue end up in mm0, red stays in mm3
        movq    mm0, [esi]
        movq    mm1, mm0
        movq    mm3, mm0
        pand    mm0, mm6                ; green:  00000000 gggggg00 00000000
        pand    mm1, mm5                ; blue :  00000000 00000000 bbbbb000
        pand    mm3, mm7                ; red  :  rrrrr000 00000000 00000000
        pslld   mm1, 2                  ; blue moved up to sit right below green
        por     mm0, mm1                ; ggggggbb bbb00000
        psrld   mm0, 5                  ; 00000ggg gggbbbbb

        ; pixels 2 and 3: green+blue end up in mm4, red stays in mm1
        movq    mm4, [esi+8]
        movq    mm2, mm4
        movq    mm1, mm4
        pand    mm4, mm6
        pand    mm2, mm5
        pand    mm1, mm7
        pslld   mm2, 2
        por     mm4, mm2
        psrld   mm4, 5

        ; Narrow to 16 bits per pixel and merge the channels.
        packuswb mm3, mm1               ; red byte lands in the high byte of each word
        packssdw mm0, mm4               ; green/blue dwords -> words
        por     mm0, mm3                ; rrrrrggg gggbbbbb x 4
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .quad_loop

        ; Scalar path for the remaining (count & 3) pixels.
.tail_setup:
        mov     ecx, edx
        and     ecx, BYTE 3
        jz      .done

.tail_loop:
        mov     bh, [esi+1]             ; green into bits 8..15 of ebx
        shr     ebx, 5
        and     ebx, 07E0h              ; green -> bits 5..10, everything else cleared

        mov     al, [esi]               ; blue
        mov     ah, [esi+2]             ; red
        shr     al, 3                   ; blue -> bits 0..4
        and     eax, 0F81Fh             ; keep red bits 11..15 and blue bits 0..4

        add     eax, ebx                ; fields do not overlap, so add acts as or
        mov     [edi], ax               ; store the 16-bit pixel
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .tail_loop

.done:
        jmp     _mmxreturn
|
|
180
|
|
181
|
|
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
; Converts 32-bit pixels (byte 0 = blue, byte 1 = green, byte 2 = red)
; to 16-bit pixels laid out as bbbbbggg gggrrrrr.
;
; In:    esi = source pixels (4 bytes each)
;        edi = destination pixels (2 bytes each)
;        ecx = number of pixels to convert
; Uses:  eax, ebx, ecx, edx, esi, edi, mm0-mm7, flags
; Exit:  jumps to _mmxreturn (defined outside this file)
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        movq    mm5, [mmx32_rgb565_r]   ; mm5 = red mask
        movq    mm6, [mmx32_rgb565_g]   ; mm6 = green mask
        movq    mm7, [mmx32_rgb565_b]   ; mm7 = blue mask

        mov     edx, ecx                ; edx = full pixel count
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail_setup

        ; Main loop: 4 source pixels (16 bytes) -> 4 destination pixels (8 bytes).
.quad_loop:
        ; pixels 0 and 1: green+red end up in mm0, blue stays in mm3
        movq    mm0, [esi]
        movq    mm1, mm0
        movq    mm3, mm0
        pand    mm0, mm6                ; green:  00000000 gggggg00 00000000
        pand    mm1, mm5                ; red  :  rrrrr000 00000000 00000000
        pand    mm3, mm7                ; blue :  00000000 00000000 bbbbb000

        psllq   mm3, 16                 ; blue up into byte 2 of each dword
        psrld   mm1, 14                 ; red down to sit right below green
        por     mm0, mm1                ; ggggggrr rrr00000
        psrld   mm0, 5                  ; 00000ggg gggrrrrr

        ; pixels 2 and 3: green+red end up in mm4, blue stays in mm1
        movq    mm4, [esi+8]
        movq    mm2, mm4
        movq    mm1, mm4
        pand    mm4, mm6
        pand    mm2, mm5
        pand    mm1, mm7

        psllq   mm1, 16
        psrld   mm2, 14
        por     mm4, mm2
        psrld   mm4, 5

        ; Narrow to 16 bits per pixel and merge the channels.
        packuswb mm3, mm1               ; blue byte lands in the high byte of each word
        packssdw mm0, mm4               ; green/red dwords -> words
        por     mm0, mm3                ; bbbbbggg gggrrrrr x 4
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad_loop

        ; Scalar path for the remaining (count & 3) pixels; edx is the counter.
.tail_setup:
        and     edx, BYTE 3
        jz      .done

.tail_loop:
        mov     bh, [esi+1]             ; green into bits 8..15 of ebx
        shr     ebx, 5
        and     ebx, 07E0h              ; green -> bits 5..10, everything else cleared

        mov     al, [esi+2]             ; red
        mov     ah, [esi]               ; blue
        shr     al, 3                   ; red -> bits 0..4
        and     eax, 0F81Fh             ; keep blue bits 11..15 and red bits 0..4

        add     eax, ebx                ; fields do not overlap, so add acts as or
        mov     [edi], ax               ; store the 16-bit pixel
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .tail_loop

.done:
        jmp     _mmxreturn
|
|
249
|
|
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
; Convert 32-bit pixels (byte 0 = blue, byte 1 = green, byte 2 = red)
; to 15-bit pixels stored in 16 bits:
;     RGB555: 0rrrrrgg gggbbbbb
;     BGR555: 0bbbbbgg gggrrrrr
;
; Both entry points share one body. The only difference is the pmaddwd
; multiplier loaded into mm7, which decides whether byte 0 or byte 2 of
; the source pixel lands in the upper field.
;
; In:    esi = source pixels (4 bytes each)
;        edi = destination pixels (2 bytes each)
;        ecx = number of pixels to convert
; Uses:  eax, ecx, edx, esi, edi, mm0-mm7, flags
; Exit:  jumps to _mmxreturn (defined outside this file)
;
; How a pixel is converted (per 32-bit lane):
;     t  = pmaddwd(pixel & 00f800f8h, mul)   ; both 5-bit end channels
;                                            ; placed around the green field
;     t |= pixel & 0000f800h                 ; green already sits at bits 11..15
;     t >>= 6                                ; 15-bit result in bits 0..14
;
; Changes relative to the earlier software-pipelined version:
;   * The leftover (count & 7) pixels go through the same mask/pmaddwd
;     sequence as the main loop. The previous scalar tail always produced
;     RGB555 order, so the BGR555 entry point emitted its last pixels with
;     red and blue swapped.
;   * The main loop no longer preloads the next 16 source bytes after the
;     last 8-pixel group, so it never reads past the pixels it converts.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        movq    mm7, qword [mmx32_bgr555_mul]   ; byte 0 -> upper field, byte 2 -> lower
        jmp     _convert_bgr555_cheat

_ConvertMMXpII32_16RGB555:

        movq    mm7, qword [mmx32_rgb555_mul]   ; byte 2 -> upper field, byte 0 -> lower

_convert_bgr555_cheat:
        movq    mm6, qword [mmx32_rgb555_g]     ; mm6 = green mask

        mov     edx, ecx                        ; edx = full pixel count
        and     ecx, BYTE -8                    ; ecx = count rounded down to a multiple of 8
        jz      .tail_setup                     ; fewer than 8 pixels: tail path only

        ; Main loop: 8 source pixels (32 bytes) -> 8 destination pixels (16 bytes).
        ; mm6 and mm7 are read-only inside the loop.
.octet_loop:
        ; ---- pixels 0..3 -> [edi] ----
        movq    mm0, [esi]                      ; pixels 0,1
        movq    mm2, [esi+8]                    ; pixels 2,3
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, qword [mmx32_rgb555_rb]    ; keep top 5 bits of bytes 0 and 2
        pand    mm3, qword [mmx32_rgb555_rb]
        pmaddwd mm1, mm7                        ; position both end channels
        pmaddwd mm3, mm7
        pand    mm0, mm6                        ; green
        pand    mm2, mm6
        por     mm1, mm0                        ; merge green in between
        por     mm3, mm2
        psrld   mm1, 6                          ; 15-bit value in bits 0..14
        psrld   mm3, 6
        packssdw mm1, mm3                       ; values <= 7fffh, so no saturation
        movq    [edi], mm1

        ; ---- pixels 4..7 -> [edi+8] ----
        movq    mm5, [esi+16]                   ; pixels 4,5
        movq    mm4, [esi+24]                   ; pixels 6,7
        movq    mm3, mm5
        movq    mm0, mm4
        pand    mm3, qword [mmx32_rgb555_rb]
        pand    mm0, qword [mmx32_rgb555_rb]
        pmaddwd mm3, mm7
        pmaddwd mm0, mm7
        pand    mm5, mm6
        pand    mm4, mm6
        por     mm5, mm3
        por     mm4, mm0
        psrld   mm5, 6
        psrld   mm4, 6
        packssdw mm5, mm4
        movq    [edi+8], mm5

        add     esi, BYTE 32
        add     edi, BYTE 16
        sub     ecx, BYTE 8
        jnz     .octet_loop

        ; Tail: remaining (count & 7) pixels, one at a time, using the same
        ; arithmetic as the main loop so the channel order follows mm7.
.tail_setup:
        mov     ecx, edx
        and     ecx, BYTE 7
        jz      .done

.tail_loop:
        movd    mm0, [esi]                      ; one pixel, upper dword zeroed
        movq    mm1, mm0
        pand    mm1, qword [mmx32_rgb555_rb]
        pmaddwd mm1, mm7
        pand    mm0, mm6
        por     mm0, mm1
        psrld   mm0, 6
        movd    eax, mm0                        ; ax = converted pixel
        mov     [edi], ax

        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .tail_loop

.done:
        jmp     _mmxreturn
|
|
384
|
|
385
|
|
386
|