diff src/hermes/x86p_32.asm @ 0:74212992fb08

Initial revision
author Sam Lantinga <slouken@lokigames.com>
date Thu, 26 Apr 2001 16:45:43 +0000
parents
children da33b7e6d181
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hermes/x86p_32.asm	Thu Apr 26 16:45:43 2001 +0000
@@ -0,0 +1,1043 @@
+;
+; x86 format converters for HERMES
+; Some routines Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
+; This source code is licensed under the GNU LGPL
+; 
+; Please refer to the file COPYING.LIB contained in the distribution for
+; licensing conditions		
+;
+; Most routines are (c) Glenn Fiedler (ptc@gaffer.org), used with permission
+; 
+
+	
+BITS 32
+
+GLOBAL _ConvertX86p32_32BGR888
+GLOBAL _ConvertX86p32_32RGBA888
+GLOBAL _ConvertX86p32_32BGRA888
+GLOBAL _ConvertX86p32_24RGB888	
+GLOBAL _ConvertX86p32_24BGR888
+GLOBAL _ConvertX86p32_16RGB565
+GLOBAL _ConvertX86p32_16BGR565
+GLOBAL _ConvertX86p32_16RGB555
+GLOBAL _ConvertX86p32_16BGR555
+GLOBAL _ConvertX86p32_8RGB332
+
+EXTERN _x86return
+		
+SECTION .text
+
+
+;; _Convert_*
+;; Parameters:
+;;   ESI = source 
+;;   EDI = dest
+;;   ECX = amount (NOT 0!!! (the _ConvertX86 routine checks for that though))
+;; Destroys:
+;;   EAX, EBX, EDX (ESI, EDI and ECX are also modified by the routines below)
+
+
+;; _ConvertX86p32_32BGR888
+;; 32-bit to 32-bit conversion that exchanges byte 0 and byte 2 of every
+;; pixel dword (bswap, then rotate right by 8); bytes 1 and 3 stay in place.
+;;   In:       ESI = source, EDI = dest, ECX = pixel count (never 0)
+;;   Out:      ESI and EDI advanced past the converted pixels
+;;   Clobbers: EAX, EBX, ECX, EDX, flags; EBP is saved and restored
+;;   Exit:     jumps to _x86return instead of executing ret
+_ConvertX86p32_32BGR888:
+        cmp     ecx, BYTE 32
+        ja      .unrolled               ; more than 32 pixels: 4-at-a-time path
+
+.short_loop:                            ; one pixel per pass
+        mov     edx, [esi]
+        bswap   edx
+        ror     edx, 8
+        mov     [edi], edx
+        add     esi, BYTE 4
+        add     edi, BYTE 4
+        dec     ecx
+        jnz     .short_loop
+.short_done:
+        jmp     _x86return
+
+.unrolled:
+        push    ebp                     ; EBP serves as the group counter
+        mov     ebp, ecx
+        shr     ebp, 2                  ; EBP = number of 4-pixel groups
+        push    ecx                     ; keep the full count; ECX carries pixel data below
+
+.quad_loop:                             ; four pixels per pass
+        mov     eax, [esi]              ; every load is issued before the first store
+        mov     ebx, [esi+4]
+        mov     ecx, [esi+8]
+        mov     edx, [esi+12]
+
+        bswap   eax
+        bswap   ebx
+        bswap   ecx
+        bswap   edx
+
+        ror     eax, 8
+        ror     ebx, 8
+        ror     ecx, 8
+        ror     edx, 8
+
+        mov     [edi+0], eax
+        mov     [edi+4], ebx
+        mov     [edi+8], ecx
+        mov     [edi+12], edx
+
+        add     esi, BYTE 16
+        add     edi, BYTE 16
+        dec     ebp
+        jnz     .quad_loop
+
+        pop     ecx                     ; recover the count
+        and     ecx, BYTE 11b           ; ECX = pixels left over (count mod 4)
+        jz      .restore
+
+.tail_loop:                             ; one pixel per pass
+        mov     edx, [esi]
+        bswap   edx
+        ror     edx, 8
+        mov     [edi], edx
+        add     esi, BYTE 4
+        add     edi, BYTE 4
+        dec     ecx
+        jnz     .tail_loop
+
+.restore:
+        pop     ebp
+        jmp     _x86return
+	
+
+	
+		
+;; _ConvertX86p32_32RGBA888
+;; 32-bit to 32-bit conversion that rotates every pixel dword left by
+;; 8 bits, so the top byte of the source becomes the bottom byte of the
+;; destination and the other three bytes each move up one position.
+;;   In:       ESI = source, EDI = dest, ECX = pixel count (never 0)
+;;   Out:      ESI and EDI advanced past the converted pixels
+;;   Clobbers: EAX, EBX, ECX, EDX, flags; EBP is saved and restored
+;;   Exit:     jumps to _x86return instead of executing ret
+_ConvertX86p32_32RGBA888:
+        cmp     ecx, BYTE 32
+        ja      .unrolled               ; more than 32 pixels: 4-at-a-time path
+
+.short_loop:                            ; one pixel per pass
+        mov     edx, [esi]
+        rol     edx, 8
+        mov     [edi], edx
+        add     esi, BYTE 4
+        add     edi, BYTE 4
+        dec     ecx
+        jnz     .short_loop
+.short_done:
+        jmp     _x86return
+
+.unrolled:
+        push    ebp                     ; EBP serves as the group counter
+        mov     ebp, ecx
+        shr     ebp, 2                  ; EBP = number of 4-pixel groups
+        push    ecx                     ; keep the full count; ECX carries pixel data below
+
+.quad_loop:                             ; four pixels per pass
+        mov     eax, [esi]              ; every load is issued before the first store
+        mov     ebx, [esi+4]
+        mov     ecx, [esi+8]
+        mov     edx, [esi+12]
+
+        rol     eax, 8
+        rol     ebx, 8
+        rol     ecx, 8
+        rol     edx, 8
+
+        mov     [edi+0], eax
+        mov     [edi+4], ebx
+        mov     [edi+8], ecx
+        mov     [edi+12], edx
+
+        add     esi, BYTE 16
+        add     edi, BYTE 16
+        dec     ebp
+        jnz     .quad_loop
+
+        pop     ecx                     ; recover the count
+        and     ecx, BYTE 11b           ; ECX = pixels left over (count mod 4)
+        jz      .restore
+
+.tail_loop:                             ; one pixel per pass
+        mov     edx, [esi]
+        rol     edx, 8
+        mov     [edi], edx
+        add     esi, BYTE 4
+        add     edi, BYTE 4
+        dec     ecx
+        jnz     .tail_loop
+
+.restore:
+        pop     ebp
+        jmp     _x86return
+
+	
+
+
+;; _ConvertX86p32_32BGRA888
+;; 32-bit to 32-bit conversion that reverses the byte order of every
+;; pixel dword (a single bswap per pixel).
+;;   In:       ESI = source, EDI = dest, ECX = pixel count (never 0)
+;;   Out:      ESI and EDI advanced past the converted pixels
+;;   Clobbers: EAX, EBX, ECX, EDX, flags; EBP is saved and restored
+;;   Exit:     jumps to _x86return instead of executing ret
+_ConvertX86p32_32BGRA888:
+        cmp     ecx, BYTE 32
+        ja      .unrolled               ; more than 32 pixels: 4-at-a-time path
+
+.short_loop:                            ; one pixel per pass
+        mov     edx, [esi]
+        bswap   edx
+        mov     [edi], edx
+        add     esi, BYTE 4
+        add     edi, BYTE 4
+        dec     ecx
+        jnz     .short_loop
+.short_done:
+        jmp     _x86return
+
+.unrolled:
+        push    ebp                     ; EBP serves as the group counter
+        mov     ebp, ecx
+        shr     ebp, 2                  ; EBP = number of 4-pixel groups
+        push    ecx                     ; keep the full count; ECX carries pixel data below
+
+.quad_loop:                             ; four pixels per pass
+        mov     eax, [esi]              ; every load is issued before the first store
+        bswap   eax
+        mov     ebx, [esi+4]
+        bswap   ebx
+        mov     ecx, [esi+8]
+        bswap   ecx
+        mov     edx, [esi+12]
+        bswap   edx
+
+        mov     [edi+0], eax
+        mov     [edi+4], ebx
+        mov     [edi+8], ecx
+        mov     [edi+12], edx
+
+        add     esi, BYTE 16
+        add     edi, BYTE 16
+        dec     ebp
+        jnz     .quad_loop
+
+        pop     ecx                     ; recover the count
+        and     ecx, BYTE 11b           ; ECX = pixels left over (count mod 4)
+        jz      .restore
+
+.tail_loop:                             ; one pixel per pass
+        mov     edx, [esi]
+        bswap   edx
+        mov     [edi], edx
+        add     esi, BYTE 4
+        add     edi, BYTE 4
+        dec     ecx
+        jnz     .tail_loop
+
+.restore:
+        pop     ebp
+        jmp     _x86return
+
+
+	
+	
+;; 32 bit RGB 888 to 24 bit RGB 888
+;;
+;; _ConvertX86p32_24RGB888
+;; Copies bytes 0, 1 and 2 of every 32-bit source pixel, in that order, to
+;; the 3-byte destination pixel; source byte 3 is dropped.
+;;   In:       ESI = source, EDI = dest, ECX = pixel count (never 0)
+;;   Out:      ESI advanced 4 bytes and EDI 3 bytes per converted pixel
+;;   Clobbers: EAX, EBX, ECX, EDX, flags; EBP is saved and restored
+;;   Exit:     jumps to _x86return instead of executing ret
+;;
+;; Register diagrams below list bytes from most to least significant;
+;; Bn/Gn/Rn/An are source bytes 0/1/2/3 of pixel n within the current group.
+_ConvertX86p32_24RGB888:
+        cmp     ecx, BYTE 32
+        ja      .align_dest             ; more than 32 pixels: dword-store path
+
+.short_loop:                            ; one pixel per pass, byte by byte
+        mov     al, [esi]
+        mov     bl, [esi+1]
+        mov     dl, [esi+2]
+        mov     [edi], al
+        mov     [edi+1], bl
+        mov     [edi+2], dl
+        add     esi, BYTE 4
+        add     edi, BYTE 3
+        dec     ecx
+        jnz     .short_loop
+.short_done:
+        jmp     _x86return
+
+.align_dest:                            ; single pixels until EDI is a multiple of 4
+        mov     edx, edi
+        and     edx, BYTE 11b
+        jz      .aligned
+        mov     al, [esi]
+        mov     bl, [esi+1]
+        mov     dl, [esi+2]
+        mov     [edi], al
+        mov     [edi+1], bl
+        mov     [edi+2], dl
+        add     esi, BYTE 4
+        add     edi, BYTE 3
+        dec     ecx
+        jmp     SHORT .align_dest
+
+.aligned:
+        push    ebp                     ; EBP serves as the group counter
+        mov     ebp, ecx
+        shr     ebp, 2                  ; EBP = number of 4-pixel groups
+        push    ecx                     ; keep the count; ECX carries pixel data below
+
+        ; Four source pixels (16 bytes) become three destination dwords
+        ; (12 bytes).  The instruction order is the original hand-scheduled
+        ; interleaving and is kept as is, because loads and stores alternate.
+.quad_loop:
+        mov     eax, [esi]              ; eax = [A0][R0][G0][B0]
+        mov     ebx, [esi+4]            ; ebx = [A1][R1][G1][B1]
+        shl     eax, 8                  ; eax = [R0][G0][B0][ 0]
+        mov     ecx, [esi+12]           ; ecx = [A3][R3][G3][B3]  (fourth pixel)
+        shl     ebx, 8                  ; ebx = [R1][G1][B1][ 0]
+        mov     al, [esi+4]             ; eax = [R0][G0][B0][B1]
+        ror     eax, 8                  ; eax = [B1][R0][G0][B0]  first dword done
+        mov     bh, [esi+8+1]           ; ebx = [R1][G1][G2][ 0]
+        mov     [edi], eax
+        add     edi, BYTE 3*4           ; EDI moves early; stores below compensate
+        shl     ecx, 8                  ; ecx = [R3][G3][B3][ 0]
+        mov     bl, [esi+8+0]           ; ebx = [R1][G1][G2][B2]
+        rol     ebx, 16                 ; ebx = [G2][B2][R1][G1]  second dword done
+        mov     cl, [esi+8+2]           ; ecx = [R3][G3][B3][R2]  third dword done
+        mov     [edi+4-3*4], ebx
+        add     esi, BYTE 4*4
+        mov     [edi+8-3*4], ecx
+        dec     ebp
+        jnz     .quad_loop
+
+        pop     ecx                     ; recover the count
+        and     ecx, BYTE 11b           ; ECX = pixels left over (count mod 4)
+        jz      .restore
+
+.tail_loop:                             ; one pixel per pass, byte by byte
+        mov     al, [esi]
+        mov     bl, [esi+1]
+        mov     dl, [esi+2]
+        mov     [edi], al
+        mov     [edi+1], bl
+        mov     [edi+2], dl
+        add     esi, BYTE 4
+        add     edi, BYTE 3
+        dec     ecx
+        jnz     .tail_loop
+
+.restore:
+        pop     ebp
+        jmp     _x86return
+
+
+
+
+;; 32 bit RGB 888 to 24 bit BGR 888
+;;
+;; _ConvertX86p32_24BGR888
+;; Writes every 32-bit source pixel as three destination bytes in reversed
+;; order: source byte 2 first, then byte 1, then byte 0.  Source byte 3 is
+;; dropped.
+;;   In:       ESI = source, EDI = dest, ECX = pixel count (never 0)
+;;   Out:      ESI advanced 4 bytes and EDI 3 bytes per converted pixel
+;;   Clobbers: EAX, EBX, ECX, EDX, flags; EBP is saved and restored
+;;   Exit:     jumps to _x86return instead of executing ret
+;;
+;; Register diagrams below list bytes from most to least significant;
+;; Bn/Gn/Rn/An are source bytes 0/1/2/3 of pixel n within the current group.
+_ConvertX86p32_24BGR888:
+        cmp     ecx, BYTE 32
+        ja      .align_dest             ; more than 32 pixels: dword-store path
+
+.short_loop:                            ; one pixel per pass, byte by byte
+        mov     dl, [esi]
+        mov     bl, [esi+1]
+        mov     al, [esi+2]
+        mov     [edi], al
+        mov     [edi+1], bl
+        mov     [edi+2], dl
+        add     esi, BYTE 4
+        add     edi, BYTE 3
+        dec     ecx
+        jnz     .short_loop
+.short_done:
+        jmp     _x86return
+
+.align_dest:                            ; single pixels until EDI is a multiple of 4
+        mov     edx, edi
+        and     edx, BYTE 11b
+        jz      .aligned
+        mov     dl, [esi]
+        mov     bl, [esi+1]
+        mov     al, [esi+2]
+        mov     [edi], al
+        mov     [edi+1], bl
+        mov     [edi+2], dl
+        add     esi, BYTE 4
+        add     edi, BYTE 3
+        dec     ecx
+        jmp     SHORT .align_dest
+
+.aligned:
+        push    ebp                     ; EBP serves as the group counter
+        mov     ebp, ecx
+        shr     ebp, 2                  ; EBP = number of 4-pixel groups
+        push    ecx                     ; keep the count; ECX carries pixel data below
+
+        ; Four source pixels (16 bytes) become three destination dwords
+        ; (12 bytes).  The instruction order is the original hand-scheduled
+        ; interleaving and is kept as is, because loads and stores alternate.
+.quad_loop:
+        mov     eax, [esi]              ; eax = [A0][R0][G0][B0]
+        mov     ebx, [esi+4]            ; ebx = [A1][R1][G1][B1]
+        bswap   eax                     ; eax = [B0][G0][R0][A0]
+        bswap   ebx                     ; ebx = [B1][G1][R1][A1]
+        mov     al, [esi+4+2]           ; eax = [B0][G0][R0][R1]
+        mov     bh, [esi+4+4+1]         ; ebx = [B1][G1][G2][A1]
+        ror     eax, 8                  ; eax = [R1][B0][G0][R0]  first dword done
+        mov     bl, [esi+4+4+2]         ; ebx = [B1][G1][G2][R2]
+        ror     ebx, 16                 ; ebx = [G2][R2][B1][G1]  second dword done
+        mov     [edi], eax
+        mov     [edi+4], ebx
+        mov     ecx, [esi+12]           ; ecx = [A3][R3][G3][B3]  (fourth pixel)
+        bswap   ecx                     ; ecx = [B3][G3][R3][A3]
+        mov     cl, [esi+8]             ; ecx = [B3][G3][R3][B2]  third dword done
+        add     esi, BYTE 4*4
+        mov     [edi+8], ecx
+        add     edi, BYTE 3*4
+        dec     ebp
+        jnz     .quad_loop
+
+        pop     ecx                     ; recover the count
+        and     ecx, BYTE 11b           ; ECX = pixels left over (count mod 4)
+        jz      .restore
+
+.tail_loop:                             ; one pixel per pass, byte by byte
+        mov     dl, [esi]
+        mov     bl, [esi+1]
+        mov     al, [esi+2]
+        mov     [edi], al
+        mov     [edi+1], bl
+        mov     [edi+2], dl
+        add     esi, BYTE 4
+        add     edi, BYTE 3
+        dec     ecx
+        jnz     .tail_loop
+
+.restore:
+        pop     ebp
+        jmp     _x86return
+ 
+
+	
+		
+;; 32 bit RGB 888 to 16 bit RGB 565
+;;
+;; _ConvertX86p32_16RGB565
+;; Packs every 32-bit source pixel into one 16-bit word:
+;;   bits 15..11 = top 5 bits of source byte 2 (red)
+;;   bits 10..5  = top 6 bits of source byte 1 (green)
+;;   bits  4..0  = top 5 bits of source byte 0 (blue)
+;;   In:       ESI = source, EDI = dest, ECX = pixel count (never 0)
+;;   Out:      ESI advanced 4 bytes and EDI 2 bytes per converted pixel
+;;   Clobbers: EAX, EBX, ECX, EDX, flags
+;;   Exit:     jumps to _x86return instead of executing ret
+_ConvertX86p32_16RGB565:
+        cmp     ecx, BYTE 16
+        ja      .long_run               ; more than 16 pixels: two-at-a-time path
+
+.short_loop:                            ; one pixel per pass
+        mov     bl, [esi+0]             ; blue
+        mov     al, [esi+1]             ; green
+        mov     ah, [esi+2]             ; red
+        shr     bl, 3                   ; blue  -> 5 bits
+        shr     ah, 3                   ; red   -> 5 bits, low end of AH
+        and     al, 11111100b           ; green -> top 6 bits only
+        shl     eax, 3                  ; AX = red:green with 5 empty low bits
+        add     al, bl                  ; drop blue into the empty bits
+        mov     [edi+0], al
+        mov     [edi+1], ah
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+        dec     ecx
+        jnz     .short_loop
+.short_done:
+        jmp     _x86return
+
+.long_run:
+        mov     ebx, edi
+        and     ebx, BYTE 11b           ; is EDI a multiple of 4?
+        jz      .aligned
+
+        ; No: convert a single pixel first, moving EDI on by 2 bytes.
+        ; NOTE(review): that only reaches a dword boundary if EDI was
+        ; 2-byte aligned on entry -- not checked here.
+        mov     bl, [esi+0]             ; blue
+        mov     al, [esi+1]             ; green
+        mov     ah, [esi+2]             ; red
+        shr     bl, 3
+        shr     ah, 3
+        and     al, 11111100b
+        shl     eax, 3
+        add     al, bl
+        mov     [edi+0], al
+        mov     [edi+1], ah
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+        dec     ecx
+
+.aligned:
+        push    ecx                     ; kept for the odd-pixel test at the end
+        shr     ecx, 1                  ; ECX = number of pixel pairs
+        lea     esi, [esi+ecx*8]        ; ESI/EDI now point just past the paired
+        lea     edi, [edi+ecx*4]        ;   region; it is indexed backwards from there
+        neg     ecx                     ; ECX runs from -pairs up to 0
+        jmp     SHORT .pair_convert
+
+        ; Each pass builds one dword holding two packed pixels.  The dword
+        ; is written at the top of the NEXT pass (or after the loop for the
+        ; last pair), hence the -4 in the store address.
+.pair_store:
+        mov     [edi+ecx*4-4], eax
+.pair_convert:
+        mov     eax, [esi+ecx*8]        ; EAX = first pixel of the pair
+        mov     ebx, [esi+ecx*8+4]      ; EBX = second pixel
+        mov     edx, [esi+ecx*8+4]      ; EDX = second pixel again (source of its red)
+        mov     dl, [esi+ecx*8+2]       ; DL  = red of the first pixel
+
+        shr     ah, 2                   ; first pixel: green -> 6 bits
+        shr     eax, 3                  ;   blue -> bits 4..0, green -> bits 10..5
+        and     eax, 000007FFh
+
+        shr     bh, 2                   ; second pixel: green -> 6 bits
+        shl     ebx, 13                 ;   blue -> bits 20..16, green -> bits 26..21
+        and     ebx, 07FF0000h
+
+        shl     edx, 8                  ; reds -> bits 15..11 and bits 31..27
+        and     edx, 0F800F800h
+
+        add     eax, ebx
+        add     eax, edx
+        inc     ecx
+        jnz     .pair_store
+
+        mov     [edi+ecx*4-4], eax      ; ECX = 0: write the final pair
+
+        pop     ecx
+        test    cl, 1                   ; odd count: one pixel is still pending
+        jz      .done
+
+        mov     bl, [esi+0]             ; blue
+        mov     al, [esi+1]             ; green
+        mov     ah, [esi+2]             ; red
+        shr     bl, 3
+        shr     ah, 3
+        and     al, 11111100b
+        shl     eax, 3
+        add     al, bl
+        mov     [edi+0], al
+        mov     [edi+1], ah
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+
+.done:
+        jmp     _x86return
+
+
+
+	
+;; 32 bit RGB 888 to 16 bit BGR 565
+;;
+;; _ConvertX86p32_16BGR565
+;; Packs every 32-bit source pixel into one 16-bit word:
+;;   bits 15..11 = top 5 bits of source byte 0 (blue)
+;;   bits 10..5  = top 6 bits of source byte 1 (green)
+;;   bits  4..0  = top 5 bits of source byte 2 (red)
+;;   In:       ESI = source, EDI = dest, ECX = pixel count (never 0)
+;;   Out:      ESI advanced 4 bytes and EDI 2 bytes per converted pixel
+;;   Clobbers: EAX, EBX, ECX, EDX, flags
+;;   Exit:     jumps to _x86return instead of executing ret
+_ConvertX86p32_16BGR565:
+        cmp     ecx, BYTE 16
+        ja      .long_run               ; more than 16 pixels: two-at-a-time path
+
+.short_loop:                            ; one pixel per pass
+        mov     ah, [esi+0]             ; blue
+        mov     al, [esi+1]             ; green
+        mov     bl, [esi+2]             ; red
+        shr     bl, 3                   ; red   -> 5 bits
+        shr     ah, 3                   ; blue  -> 5 bits, low end of AH
+        and     al, 11111100b           ; green -> top 6 bits only
+        shl     eax, 3                  ; AX = blue:green with 5 empty low bits
+        add     al, bl                  ; drop red into the empty bits
+        mov     [edi+0], al
+        mov     [edi+1], ah
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+        dec     ecx
+        jnz     .short_loop
+.short_done:
+        jmp     _x86return
+
+.long_run:
+        mov     ebx, edi
+        and     ebx, BYTE 11b           ; is EDI a multiple of 4?
+        jz      .aligned
+
+        ; No: convert a single pixel first, moving EDI on by 2 bytes.
+        ; NOTE(review): that only reaches a dword boundary if EDI was
+        ; 2-byte aligned on entry -- not checked here.
+        mov     ah, [esi+0]             ; blue
+        mov     al, [esi+1]             ; green
+        mov     bl, [esi+2]             ; red
+        shr     bl, 3
+        shr     ah, 3
+        and     al, 11111100b
+        shl     eax, 3
+        add     al, bl
+        mov     [edi+0], al
+        mov     [edi+1], ah
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+        dec     ecx
+
+.aligned:
+        push    ecx                     ; kept for the odd-pixel test at the end
+        shr     ecx, 1                  ; ECX = number of pixel pairs
+        lea     esi, [esi+ecx*8]        ; ESI/EDI now point just past the paired
+        lea     edi, [edi+ecx*4]        ;   region; it is indexed backwards from there
+        neg     ecx                     ; ECX runs from -pairs up to 0
+        jmp     SHORT .pair_convert
+
+        ; Each pass builds one dword holding two packed pixels.  The dword
+        ; is written at the top of the NEXT pass (or after the loop for the
+        ; last pair), hence the -4 in the store address.
+.pair_store:
+        mov     [edi+ecx*4-4], eax
+.pair_convert:
+        mov     ah, [esi+ecx*8]         ; first pixel: blue
+        mov     al, [esi+ecx*8+1]       ;   green
+        shr     ah, 3                   ;   blue -> 5 bits
+        shl     eax, 3                  ;   blue -> bits 15..11, green -> bits 10..3
+        and     eax, 0000FFE0h          ;   keep bits 15..5 (6 green bits survive)
+
+        mov     bh, [esi+ecx*8+4]       ; second pixel: blue
+        mov     bl, [esi+ecx*8+5]       ;   green
+        shr     bh, 3                   ;   blue -> 5 bits
+        shl     ebx, 19                 ;   blue -> bits 31..27, green -> bits 26..19
+        and     ebx, 0FFE00000h         ;   keep bits 31..21
+
+        mov     edx, [esi+ecx*8+4]      ; EDX = second pixel (its red is byte 2)
+        mov     dl, [esi+ecx*8+2]       ; DL  = red of the first pixel
+        shr     edx, 3                  ; reds -> bits 4..0 and bits 20..16
+        and     edx, 001F001Fh
+
+        add     eax, ebx
+        add     eax, edx
+        inc     ecx
+        jnz     .pair_store
+
+        mov     [edi+ecx*4-4], eax      ; ECX = 0: write the final pair
+
+        pop     ecx
+        and     ecx, BYTE 1             ; odd count: one pixel is still pending
+        jz      .done
+
+        mov     ah, [esi+0]             ; blue
+        mov     al, [esi+1]             ; green
+        mov     bl, [esi+2]             ; red
+        shr     bl, 3
+        shr     ah, 3
+        and     al, 11111100b
+        shl     eax, 3
+        add     al, bl
+        mov     [edi+0], al
+        mov     [edi+1], ah
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+
+.done:
+        jmp     _x86return
+
+
+	
+	
+;; 32 bit RGB 888 to 16 bit RGB 555
+;;
+;; _ConvertX86p32_16RGB555
+;; Packs every 32-bit source pixel into one 16-bit word:
+;;   bit  15     = 0
+;;   bits 14..10 = top 5 bits of source byte 2 (red)
+;;   bits  9..5  = top 5 bits of source byte 1 (green)
+;;   bits  4..0  = top 5 bits of source byte 0 (blue)
+;;   In:       ESI = source, EDI = dest, ECX = pixel count (never 0)
+;;   Out:      ESI advanced 4 bytes and EDI 2 bytes per converted pixel
+;;   Clobbers: EAX, EBX, ECX, EDX, flags
+;;   Exit:     jumps to _x86return instead of executing ret
+_ConvertX86p32_16RGB555:
+        cmp     ecx, BYTE 16
+        ja      .long_run               ; more than 16 pixels: two-at-a-time path
+
+.short_loop:                            ; one pixel per pass
+        mov     bl, [esi+0]             ; blue
+        mov     al, [esi+1]             ; green
+        mov     ah, [esi+2]             ; red
+        shr     bl, 3                   ; blue  -> 5 bits
+        shr     ah, 3                   ; red   -> 5 bits, low end of AH
+        and     al, 11111000b           ; green -> top 5 bits only
+        shl     eax, 2                  ; AX = 0:red:green with 5 empty low bits
+        add     al, bl                  ; drop blue into the empty bits
+        mov     [edi+0], al
+        mov     [edi+1], ah
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+        dec     ecx
+        jnz     .short_loop
+.short_done:
+        jmp     _x86return
+
+.long_run:
+        mov     ebx, edi
+        and     ebx, BYTE 11b           ; is EDI a multiple of 4?
+        jz      .aligned
+
+        ; No: convert a single pixel first, moving EDI on by 2 bytes.
+        ; NOTE(review): that only reaches a dword boundary if EDI was
+        ; 2-byte aligned on entry -- not checked here.
+        mov     bl, [esi+0]             ; blue
+        mov     al, [esi+1]             ; green
+        mov     ah, [esi+2]             ; red
+        shr     bl, 3
+        shr     ah, 3
+        and     al, 11111000b
+        shl     eax, 2
+        add     al, bl
+        mov     [edi+0], al
+        mov     [edi+1], ah
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+        dec     ecx
+
+.aligned:
+        push    ecx                     ; kept for the odd-pixel test at the end
+        shr     ecx, 1                  ; ECX = number of pixel pairs
+        lea     esi, [esi+ecx*8]        ; ESI/EDI now point just past the paired
+        lea     edi, [edi+ecx*4]        ;   region; it is indexed backwards from there
+        neg     ecx                     ; ECX runs from -pairs up to 0
+        jmp     SHORT .pair_convert
+
+        ; Each pass builds one dword holding two packed pixels.  The dword
+        ; is written at the top of the NEXT pass (or after the loop for the
+        ; last pair), hence the -4 in the store address.
+.pair_store:
+        mov     [edi+ecx*4-4], eax
+.pair_convert:
+        mov     eax, [esi+ecx*8]        ; EAX = first pixel of the pair
+        mov     ebx, [esi+ecx*8+4]      ; EBX = second pixel
+        mov     edx, [esi+ecx*8+4]      ; EDX = second pixel again (source of its red)
+        mov     dl, [esi+ecx*8+2]       ; DL  = red of the first pixel
+
+        shr     ah, 3                   ; first pixel: green -> 5 bits
+        shr     eax, 3                  ;   blue -> bits 4..0, green -> bits 9..5
+        and     eax, 000007FFh          ;   bit 10 is already 0 after the shr ah
+
+        shr     bh, 3                   ; second pixel: green -> 5 bits
+        shl     ebx, 13                 ;   blue -> bits 20..16, green -> bits 25..21
+        and     ebx, 07FF0000h
+
+        shl     edx, 7                  ; reds -> bits 14..10 and bits 30..26
+        and     edx, 07C007C00h
+
+        add     eax, ebx
+        add     eax, edx
+        inc     ecx
+        jnz     .pair_store
+
+        mov     [edi+ecx*4-4], eax      ; ECX = 0: write the final pair
+
+        pop     ecx
+        and     ecx, BYTE 1             ; odd count: one pixel is still pending
+        jz      .done
+
+        mov     bl, [esi+0]             ; blue
+        mov     al, [esi+1]             ; green
+        mov     ah, [esi+2]             ; red
+        shr     bl, 3
+        shr     ah, 3
+        and     al, 11111000b
+        shl     eax, 2
+        add     al, bl
+        mov     [edi+0], al
+        mov     [edi+1], ah
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+
+.done:
+        jmp     _x86return
+
+
+
+
+;; 32 bit RGB 888 to 16 bit BGR 555
+;;
+;; _ConvertX86p32_16BGR555
+;; Packs every 32-bit source pixel into one 16-bit word:
+;;   bit  15     = 0
+;;   bits 14..10 = top 5 bits of source byte 0 (blue)
+;;   bits  9..5  = top 5 bits of source byte 1 (green)
+;;   bits  4..0  = top 5 bits of source byte 2 (red)
+;;   In:       ESI = source, EDI = dest, ECX = pixel count (never 0)
+;;   Out:      ESI advanced 4 bytes and EDI 2 bytes per converted pixel
+;;   Clobbers: EAX, EBX, ECX, EDX, flags
+;;   Exit:     jumps to _x86return instead of executing ret
+_ConvertX86p32_16BGR555:
+        cmp     ecx, BYTE 16
+        ja      .long_run               ; more than 16 pixels: two-at-a-time path
+
+.short_loop:                            ; one pixel per pass
+        mov     ah, [esi+0]             ; blue
+        mov     al, [esi+1]             ; green
+        mov     bl, [esi+2]             ; red
+        shr     bl, 3                   ; red   -> 5 bits
+        shr     ah, 3                   ; blue  -> 5 bits, low end of AH
+        and     al, 11111000b           ; green -> top 5 bits only
+        shl     eax, 2                  ; AX = 0:blue:green with 5 empty low bits
+        add     al, bl                  ; drop red into the empty bits
+        mov     [edi+0], al
+        mov     [edi+1], ah
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+        dec     ecx
+        jnz     .short_loop
+.short_done:
+        jmp     _x86return
+
+.long_run:
+        mov     ebx, edi
+        and     ebx, BYTE 11b           ; is EDI a multiple of 4?
+        jz      .aligned
+
+        ; No: convert a single pixel first, moving EDI on by 2 bytes.
+        ; NOTE(review): that only reaches a dword boundary if EDI was
+        ; 2-byte aligned on entry -- not checked here.
+        mov     ah, [esi+0]             ; blue
+        mov     al, [esi+1]             ; green
+        mov     bl, [esi+2]             ; red
+        shr     bl, 3
+        shr     ah, 3
+        and     al, 11111000b
+        shl     eax, 2
+        add     al, bl
+        mov     [edi+0], al
+        mov     [edi+1], ah
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+        dec     ecx
+
+.aligned:
+        push    ecx                     ; kept for the odd-pixel test at the end
+        shr     ecx, 1                  ; ECX = number of pixel pairs
+        lea     esi, [esi+ecx*8]        ; ESI/EDI now point just past the paired
+        lea     edi, [edi+ecx*4]        ;   region; it is indexed backwards from there
+        neg     ecx                     ; ECX runs from -pairs up to 0
+        jmp     SHORT .pair_convert
+
+        ; Each pass builds one dword holding two packed pixels.  The dword
+        ; is written at the top of the NEXT pass (or after the loop for the
+        ; last pair), hence the -4 in the store address.
+.pair_store:
+        mov     [edi+ecx*4-4], eax
+.pair_convert:
+        mov     ah, [esi+ecx*8]         ; first pixel: blue
+        mov     al, [esi+ecx*8+1]       ;   green
+        shr     ah, 3                   ;   blue -> 5 bits
+        shl     eax, 2                  ;   blue -> bits 14..10, green -> bits 9..2
+        and     eax, 00007FE0h          ;   keep bits 14..5 (5 green bits survive)
+
+        mov     bh, [esi+ecx*8+4]       ; second pixel: blue
+        mov     bl, [esi+ecx*8+5]       ;   green
+        shr     bh, 3                   ;   blue -> 5 bits
+        shl     ebx, 18                 ;   blue -> bits 30..26, green -> bits 25..18
+        and     ebx, 07FE00000h         ;   keep bits 30..21
+
+        mov     edx, [esi+ecx*8+4]      ; EDX = second pixel (its red is byte 2)
+        mov     dl, [esi+ecx*8+2]       ; DL  = red of the first pixel
+        shr     edx, 3                  ; reds -> bits 4..0 and bits 20..16
+        and     edx, 001F001Fh
+
+        add     eax, ebx
+        add     eax, edx
+        inc     ecx
+        jnz     .pair_store
+
+        mov     [edi+ecx*4-4], eax      ; ECX = 0: write the final pair
+
+        pop     ecx
+        and     ecx, BYTE 1             ; odd count: one pixel is still pending
+        jz      .done
+
+        mov     ah, [esi+0]             ; blue
+        mov     al, [esi+1]             ; green
+        mov     bl, [esi+2]             ; red
+        shr     bl, 3
+        shr     ah, 3
+        and     al, 11111000b
+        shl     eax, 2
+        add     al, bl
+        mov     [edi+0], al
+        mov     [edi+1], ah
+        add     esi, BYTE 4
+        add     edi, BYTE 2
+
+.done:
+        jmp     _x86return
+
+
+
+
+	
+;; 32 bit RGB 888 to 8 bit RGB 332 (rrrgggbb)
+;;
+;; _ConvertX86p32_8RGB332
+;; Packs every 32-bit source pixel into one byte:
+;;   bits 7..5 = top 3 bits of source byte 2 (red)
+;;   bits 4..2 = top 3 bits of source byte 1 (green)
+;;   bits 1..0 = top 2 bits of source byte 0 (blue)
+;; Pixels are converted four at a time and written as one dword; up to
+;; three leftover pixels are then written one byte at a time.
+;;   In:       ESI = source, EDI = dest, ECX = pixel count
+;;   Out:      ESI advanced 4 bytes and EDI 1 byte per converted pixel
+;;   Clobbers: EAX, EBX, ECX, EDX, flags
+;;   Exit:     jumps to _x86return instead of executing ret
+;;
+;; The two conditional branches around the four-pixel loop are encoded
+;; NEAR because the loop body is longer than a short jump can span.
+_ConvertX86p32_8RGB332:
+        push    ecx                     ; kept for the leftover-pixel count
+        shr     ecx, 2                  ; ECX = number of 4-pixel groups
+        jz      NEAR .leftover          ; fewer than four pixels in total
+
+.quad_loop:
+        ; --- pixels 0 and 1 -> AX ---
+        mov     eax, [esi]              ; EAX = pixel 0
+        mov     edx, [esi+4]            ; EDX = pixel 1
+
+        shr     dl, 6                   ; p1 blue  -> bits 1..0
+        mov     ebx, eax
+
+        shr     al, 6                   ; p0 blue  -> bits 1..0
+        and     ah, 0e0h                ; p0 green: top 3 bits
+
+        shr     ebx, 16                 ; BL = p0 red
+        and     dh, 0e0h                ; p1 green: top 3 bits
+
+        shr     ah, 3                   ; p0 green -> bits 4..2
+        and     bl, 0e0h                ; p0 red stays in bits 7..5
+
+        shr     dh, 3                   ; p1 green -> bits 4..2
+
+        or      al, bl
+
+        mov     ebx, edx
+        or      al, ah                  ; AL = packed pixel 0
+
+        shr     ebx, 16                 ; BL = p1 red
+        or      dl, dh
+
+        and     bl, 0e0h
+
+        or      dl, bl                  ; DL = packed pixel 1
+
+        mov     ah, dl                  ; AX = pixel 1 : pixel 0
+
+        ; --- pixels 2 and 3 -> AX, pixels 0/1 parked in the high word ---
+        mov     ebx, [esi+8]            ; EBX = pixel 2
+
+        mov     edx, ebx
+        and     bh, 0e0h                ; p2 green: top 3 bits
+
+        shr     bl, 6                   ; p2 blue  -> bits 1..0
+        and     edx, 0e00000h           ; p2 red: top 3 bits
+
+        shr     edx, 16                 ; DL = p2 red in bits 7..5
+
+        shr     bh, 3                   ; p2 green -> bits 4..2
+
+        ror     eax, 16                 ; move pixels 1:0 to the high word
+        or      bl, dl
+
+        mov     edx, [esi+12]           ; EDX = pixel 3
+        or      bl, bh                  ; BL = packed pixel 2
+
+        mov     al, bl
+
+        mov     ebx, edx
+        and     dh, 0e0h                ; p3 green: top 3 bits
+
+        shr     dl, 6                   ; p3 blue  -> bits 1..0
+        and     ebx, 0e00000h           ; p3 red: top 3 bits
+
+        shr     dh, 3                   ; p3 green -> bits 4..2
+        mov     ah, dl
+
+        shr     ebx, 16                 ; BL = p3 red in bits 7..5
+        or      ah, dh
+
+        or      ah, bl                  ; AH = packed pixel 3
+
+        rol     eax, 16                 ; EAX = p3 : p2 : p1 : p0
+        add     esi, BYTE 16
+
+        mov     [edi], eax
+        add     edi, BYTE 4
+
+        dec     ecx
+        jnz     NEAR .quad_loop
+
+.leftover:
+        pop     ecx
+        and     ecx, BYTE 3             ; ECX = pixels left over (count mod 4)
+        jz      .done
+
+.single_loop:                           ; one pixel per pass
+        mov     eax, [esi]
+
+        mov     ebx, eax
+
+        shr     al, 6                   ; blue  -> bits 1..0
+        and     ah, 0e0h                ; green: top 3 bits
+
+        shr     ebx, 16                 ; BL = red
+
+        shr     ah, 3                   ; green -> bits 4..2
+        and     bl, 0e0h                ; red stays in bits 7..5
+
+        or      al, ah
+        or      al, bl
+
+        mov     [edi], al
+
+        inc     edi
+        add     esi, BYTE 4
+
+        dec     ecx
+        jnz     .single_loop
+
+.done:
+        jmp     _x86return