view src/hermes/x86p_32.asm @ 1173:e9cf8c1b4590

Split up src/SDL_loadso.c into platform directories.
author Ryan C. Gordon <icculus@icculus.org>
date Thu, 17 Nov 2005 03:15:05 +0000
parents da33b7e6d181
children 2d6dc7de1145
line wrap: on
line source

;
; x86 format converters for HERMES
; Some routines Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
; This source code is licensed under the GNU LGPL
; 
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions		
;
; Most routines are (c) Glenn Fiedler (ptc@gaffer.org), used with permission
; 

	
; Assemble everything in this file as 32-bit code.
BITS 32

; Converter entry points exported from this file.
GLOBAL _ConvertX86p32_32BGR888
GLOBAL _ConvertX86p32_32RGBA888
GLOBAL _ConvertX86p32_32BGRA888
GLOBAL _ConvertX86p32_24RGB888	
GLOBAL _ConvertX86p32_24BGR888
GLOBAL _ConvertX86p32_16RGB565
GLOBAL _ConvertX86p32_16BGR565
GLOBAL _ConvertX86p32_16RGB555
GLOBAL _ConvertX86p32_16BGR555
GLOBAL _ConvertX86p32_8RGB332

; Every converter below finishes by jumping to this label; it is defined
; outside this file.
EXTERN _x86return
	
; NOTE(review): .note.GNU-stack looks like the ELF/GNU marker for a
; non-executable stack; presumably non-ELF output formats need this line
; guarded or dropped -- confirm against the build configuration.
SECTION .note.GNU-stack noalloc progbits noexec nowrite
SECTION .text


;; _Convert_*
;; Parameters:
;;   ESI = source
;;   EDI = dest
;;   ECX = amount (must NOT be 0; the _ConvertX86 routine checks for that)
;; Destroys:
;;   EAX, EBX, EDX


_ConvertX86p32_32BGR888:
;; 32-bit -> 32-bit: exchanges byte 0 and byte 2 of every source dword and
;; leaves bytes 1 and 3 where they were (bswap followed by ror 8).
;;   In:       ESI = source, EDI = dest, ECX = pixel count (non-zero)
;;   Modifies: EAX, EBX, ECX, EDX, ESI, EDI; EBP is pushed and popped
;;   Leaves by jumping to _x86return.

        cmp     ecx, BYTE 32
        ja      .unrolled               ; more than 32 pixels: 4-at-a-time path

.short_loop:                            ; one pixel per pass
        mov     edx, [esi]
        bswap   edx                     ; bytes 3 2 1 0 -> 0 1 2 3
        ror     edx, 8                  ;               -> 3 0 1 2
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .short_loop
        jmp     _x86return

.unrolled:
        push    ebp                     ; ebp becomes the group counter
        mov     ebp, ecx
        shr     ebp, 2                  ; ebp = count / 4
        push    ecx                     ; full count, needed for the remainder

.quad_loop:
        ; fetch four pixels
        mov     eax, [esi]
        mov     ebx, [esi+4]
        mov     ecx, [esi+8]
        mov     edx, [esi+12]

        ; reverse the byte order of each ...
        bswap   eax
        bswap   ebx
        bswap   ecx
        bswap   edx

        ; ... then rotate the old byte 3 back to the top
        ror     eax, 8
        ror     ebx, 8
        ror     ecx, 8
        ror     edx, 8

        ; write four pixels
        mov     [edi], eax
        mov     [edi+4], ebx
        mov     [edi+8], ecx
        mov     [edi+12], edx

        add     esi, BYTE 16
        add     edi, BYTE 16
        dec     ebp
        jnz     .quad_loop

        pop     ecx
        and     ecx, BYTE 11b           ; pixels left over: count mod 4
        jz      .done

.tail_loop:                             ; same conversion, one pixel per pass
        mov     edx, [esi]
        bswap   edx
        ror     edx, 8
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .tail_loop

.done:
        pop     ebp
        jmp     _x86return
	

	
		
_ConvertX86p32_32RGBA888:
;; 32-bit -> 32-bit: rotates every source dword left by 8 bits, so the top
;; byte ends up in the bottom byte and the other three move up one place.
;;   In:       ESI = source, EDI = dest, ECX = pixel count (non-zero)
;;   Modifies: EAX, EBX, ECX, EDX, ESI, EDI; EBP is pushed and popped
;;   Leaves by jumping to _x86return.

        cmp     ecx, BYTE 32
        ja      .unrolled               ; more than 32 pixels: 4-at-a-time path

.short_loop:                            ; one pixel per pass
        mov     edx, [esi]
        rol     edx, 8
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .short_loop
        jmp     _x86return

.unrolled:
        push    ebp                     ; ebp becomes the group counter
        mov     ebp, ecx
        shr     ebp, 2                  ; ebp = count / 4
        push    ecx                     ; full count, needed for the remainder

.quad_loop:
        ; fetch four pixels
        mov     eax, [esi]
        mov     ebx, [esi+4]
        mov     ecx, [esi+8]
        mov     edx, [esi+12]

        ; rotate each one
        rol     eax, 8
        rol     ebx, 8
        rol     ecx, 8
        rol     edx, 8

        ; write four pixels
        mov     [edi], eax
        mov     [edi+4], ebx
        mov     [edi+8], ecx
        mov     [edi+12], edx

        add     esi, BYTE 16
        add     edi, BYTE 16
        dec     ebp
        jnz     .quad_loop

        pop     ecx
        and     ecx, BYTE 11b           ; pixels left over: count mod 4
        jz      .done

.tail_loop:                             ; same conversion, one pixel per pass
        mov     edx, [esi]
        rol     edx, 8
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .tail_loop

.done:
        pop     ebp
        jmp     _x86return

	


_ConvertX86p32_32BGRA888:
;; 32-bit -> 32-bit: reverses the byte order of every source dword (bswap).
;;   In:       ESI = source, EDI = dest, ECX = pixel count (non-zero)
;;   Modifies: EAX, EBX, ECX, EDX, ESI, EDI; EBP is pushed and popped
;;   Leaves by jumping to _x86return.

        cmp     ecx, BYTE 32
        ja      .unrolled               ; more than 32 pixels: 4-at-a-time path

.short_loop:                            ; one pixel per pass
        mov     edx, [esi]
        bswap   edx
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .short_loop
        jmp     _x86return

.unrolled:
        push    ebp                     ; ebp becomes the group counter
        mov     ebp, ecx
        shr     ebp, 2                  ; ebp = count / 4
        push    ecx                     ; full count, needed for the remainder

.quad_loop:
        ; fetch four pixels
        mov     eax, [esi]
        mov     ebx, [esi+4]
        mov     ecx, [esi+8]
        mov     edx, [esi+12]

        ; byte-reverse each one
        bswap   eax
        bswap   ebx
        bswap   ecx
        bswap   edx

        ; write four pixels
        mov     [edi], eax
        mov     [edi+4], ebx
        mov     [edi+8], ecx
        mov     [edi+12], edx

        add     esi, BYTE 16
        add     edi, BYTE 16
        dec     ebp
        jnz     .quad_loop

        pop     ecx
        and     ecx, BYTE 11b           ; pixels left over: count mod 4
        jz      .done

.tail_loop:                             ; same conversion, one pixel per pass
        mov     edx, [esi]
        bswap   edx
        mov     [edi], edx
        add     esi, BYTE 4
        add     edi, BYTE 4
        dec     ecx
        jnz     .tail_loop

.done:
        pop     ebp
        jmp     _x86return


	
	
;; 32 bit RGB 888 to 24 bit RGB 888
;;
;; Copies bytes 0, 1 and 2 of every 4-byte source pixel to three consecutive
;; destination bytes; source byte 3 is dropped.
;;   In:       ESI = source, EDI = dest, ECX = pixel count (non-zero)
;;   Modifies: EAX, EBX, ECX, EDX, ESI, EDI; EBP is pushed and popped
;;   Leaves by jumping to _x86return.
;;
;; Register pictures in the comments list bytes from bits 31..24 down to
;; bits 7..0; Bn/Gn/Rn/An are source bytes 0/1/2/3 of pixel n in the group.

_ConvertX86p32_24RGB888:
        cmp     ecx, BYTE 32
        ja      .align_dest             ; more than 32 pixels: bulk path

.short_loop:                            ; one pixel per pass
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .short_loop
        jmp     _x86return

.align_dest:                            ; single pixels until EDI is a multiple of 4
        mov     edx, edi
        and     edx, BYTE 11b
        jz      .aligned
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jmp     SHORT .align_dest

.aligned:
        push    ebp                     ; ebp becomes the group counter
        mov     ebp, ecx
        shr     ebp, 2                  ; ebp = remaining count / 4
        push    ecx                     ; remaining count, needed for the tail

.quad_loop:                             ; 4 source pixels -> 3 destination dwords
        mov     eax, [esi]              ; eax = [A0][R0][G0][B0]
        mov     ebx, [esi+4]            ; ebx = [A1][R1][G1][B1]
        mov     ecx, [esi+12]           ; ecx = [A3][R3][G3][B3]

        shl     eax, 8                  ; eax = [R0][G0][B0][..]
        mov     al, [esi+4]             ; eax = [R0][G0][B0][B1]
        ror     eax, 8                  ; eax = [B1][R0][G0][B0]  first dword

        shl     ebx, 8                  ; ebx = [R1][G1][B1][..]
        mov     bh, [esi+8+1]           ; ebx = [R1][G1][G2][..]
        mov     [edi], eax
        mov     bl, [esi+8+0]           ; ebx = [R1][G1][G2][B2]
        rol     ebx, 16                 ; ebx = [G2][B2][R1][G1]  second dword

        shl     ecx, 8                  ; ecx = [R3][G3][B3][..]
        mov     cl, [esi+8+2]           ; ecx = [R3][G3][B3][R2]  third dword

        mov     [edi+4], ebx
        mov     [edi+8], ecx

        add     esi, BYTE 4*4
        add     edi, BYTE 3*4
        dec     ebp
        jnz     .quad_loop

        pop     ecx
        and     ecx, BYTE 11b           ; pixels left over: count mod 4
        jz      .done

.tail_loop:                             ; one pixel per pass
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .tail_loop

.done:
        pop     ebp
        jmp     _x86return




;; 32 bit RGB 888 to 24 bit BGR 888
;;
;; Writes bytes 2, 1 and 0 (in that order) of every 4-byte source pixel to
;; three consecutive destination bytes; source byte 3 is dropped.
;;   In:       ESI = source, EDI = dest, ECX = pixel count (non-zero)
;;   Modifies: EAX, EBX, ECX, EDX, ESI, EDI; EBP is pushed and popped
;;   Leaves by jumping to _x86return.
;;
;; Register pictures in the comments list bytes from bits 31..24 down to
;; bits 7..0; Bn/Gn/Rn/An are source bytes 0/1/2/3 of pixel n in the group.

_ConvertX86p32_24BGR888:
        cmp     ecx, BYTE 32
        ja      .align_dest             ; more than 32 pixels: bulk path

.short_loop:                            ; one pixel per pass
        mov     dl, [esi]
        mov     bl, [esi+1]
        mov     al, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .short_loop
        jmp     _x86return

.align_dest:                            ; single pixels until EDI is a multiple of 4
        mov     edx, edi
        and     edx, BYTE 11b
        jz      .aligned
        mov     dl, [esi]
        mov     bl, [esi+1]
        mov     al, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jmp     SHORT .align_dest

.aligned:
        push    ebp                     ; ebp becomes the group counter
        mov     ebp, ecx
        shr     ebp, 2                  ; ebp = remaining count / 4
        push    ecx                     ; remaining count, needed for the tail

.quad_loop:                             ; 4 source pixels -> 3 destination dwords
        mov     eax, [esi]              ; eax = [A0][R0][G0][B0]
        mov     ebx, [esi+4]            ; ebx = [A1][R1][G1][B1]

        bswap   eax                     ; eax = [B0][G0][R0][A0]
        mov     al, [esi+4+2]           ; eax = [B0][G0][R0][R1]
        ror     eax, 8                  ; eax = [R1][B0][G0][R0]  first dword

        bswap   ebx                     ; ebx = [B1][G1][R1][A1]
        mov     bh, [esi+4+4+1]         ; ebx = [B1][G1][G2][A1]
        mov     bl, [esi+4+4+2]         ; ebx = [B1][G1][G2][R2]
        ror     ebx, 16                 ; ebx = [G2][R2][B1][G1]  second dword

        mov     [edi], eax
        mov     [edi+4], ebx

        mov     ecx, [esi+12]           ; ecx = [A3][R3][G3][B3]
        bswap   ecx                     ; ecx = [B3][G3][R3][A3]
        mov     cl, [esi+8]             ; ecx = [B3][G3][R3][B2]  third dword
        mov     [edi+8], ecx

        add     esi, BYTE 4*4
        add     edi, BYTE 3*4
        dec     ebp
        jnz     .quad_loop

        pop     ecx
        and     ecx, BYTE 11b           ; pixels left over: count mod 4
        jz      .done

.tail_loop:                             ; one pixel per pass
        mov     dl, [esi]
        mov     bl, [esi+1]
        mov     al, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .tail_loop

.done:
        pop     ebp
        jmp     _x86return
 

	
		
;; 32 bit RGB 888 to 16 bit RGB 565
;;
;; Each output word is rrrrrggg gggbbbbb, built from the top 5 bits of source
;; byte 2 (red), the top 6 bits of byte 1 (green) and the top 5 bits of
;; byte 0 (blue).
;;   In:       ESI = source, EDI = dest, ECX = pixel count (non-zero)
;;   Modifies: EAX, EBX, ECX, EDX, ESI, EDI
;;   Leaves by jumping to _x86return.

_ConvertX86p32_16RGB565:
        cmp     ecx, BYTE 16
        ja      .head                   ; more than 16 pixels: paired path

.short_loop:                            ; one pixel per pass
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        and     al, 11111100b
        shr     ah, 3
        shl     eax, 3                  ; ax = rrrrrggg ggg00000
        mov     bl, [esi+0]             ; blue
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .short_loop
        jmp     _x86return

.head:                                  ; one pixel first if EDI is not a multiple of 4
        mov     ebx, edi
        and     ebx, BYTE 11b
        jz      .aligned

        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        and     al, 11111100b
        shr     ah, 3
        shl     eax, 3
        mov     bl, [esi+0]             ; blue
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx

.aligned:
        push    ecx                     ; remaining count, low bit tested for the tail
        shr     ecx, 1                  ; ecx = number of pixel pairs

        ; Aim both pointers at the end of the paired region and count a
        ; negative index up to zero.
        lea     esi, [esi+ecx*8]
        lea     edi, [edi+ecx*4]
        neg     ecx

.pair_loop:                             ; two pixels in, one dword out
        mov     eax, [esi+ecx*8]        ; first pixel of the pair
        mov     ebx, [esi+ecx*8+4]      ; second pixel
        mov     edx, [esi+ecx*8+4]      ; second pixel again, for the reds
        mov     dl, [esi+ecx*8+2]       ; ... with the first pixel's red in the low byte

        shr     ah, 2                   ; first pixel: green to 6 bits
        shr     eax, 3
        and     eax, 000007FFh          ; green+blue of the low word

        shr     bh, 2                   ; second pixel: green to 6 bits
        shl     ebx, 13
        and     ebx, 07FF0000h          ; green+blue of the high word

        shl     edx, 8
        and     edx, 0F800F800h         ; both 5-bit reds

        add     eax, ebx
        add     eax, edx
        mov     [edi+ecx*4], eax
        inc     ecx
        jnz     .pair_loop

        pop     ecx
        test    cl, 1                   ; odd pixel left over?
        jz      .done

        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        and     al, 11111100b
        shr     ah, 3
        shl     eax, 3
        mov     bl, [esi+0]             ; blue
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2

.done:
        jmp     _x86return



	
;; 32 bit RGB 888 to 16 bit BGR 565
;;
;; Each output word is bbbbbggg gggrrrrr, built from the top 5 bits of source
;; byte 0 (blue), the top 6 bits of byte 1 (green) and the top 5 bits of
;; byte 2 (red).
;;   In:       ESI = source, EDI = dest, ECX = pixel count (non-zero)
;;   Modifies: EAX, EBX, ECX, EDX, ESI, EDI
;;   Leaves by jumping to _x86return.

_ConvertX86p32_16BGR565:
        cmp     ecx, BYTE 16
        ja      .head                   ; more than 16 pixels: paired path

.short_loop:                            ; one pixel per pass
        mov     al, [esi+1]             ; green
        mov     ah, [esi+0]             ; blue
        and     al, 11111100b
        shr     ah, 3
        shl     eax, 3                  ; ax = bbbbbggg ggg00000
        mov     bl, [esi+2]             ; red
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .short_loop
        jmp     _x86return

.head:                                  ; one pixel first if EDI is not a multiple of 4
        mov     ebx, edi
        and     ebx, BYTE 11b
        jz      .aligned

        mov     al, [esi+1]             ; green
        mov     ah, [esi+0]             ; blue
        and     al, 11111100b
        shr     ah, 3
        shl     eax, 3
        mov     bl, [esi+2]             ; red
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx

.aligned:
        push    ecx                     ; remaining count, low bit tested for the tail
        shr     ecx, 1                  ; ecx = number of pixel pairs

        ; Aim both pointers at the end of the paired region and count a
        ; negative index up to zero.
        lea     esi, [esi+ecx*8]
        lea     edi, [edi+ecx*4]
        neg     ecx

.pair_loop:                             ; two pixels in, one dword out
        mov     ah, [esi+ecx*8]         ; first pixel: blue
        mov     al, [esi+ecx*8+1]       ;              green
        shr     ah, 3
        shl     eax, 3
        and     eax, 0000FFE0h          ; blue+green of the low word

        mov     bh, [esi+ecx*8+4]       ; second pixel: blue
        mov     bl, [esi+ecx*8+5]       ;               green
        shr     bh, 3
        shl     ebx, 19
        and     ebx, 0FFE00000h         ; blue+green of the high word

        mov     edx, [esi+ecx*8+4]      ; second pixel, for its red
        mov     dl, [esi+ecx*8+2]       ; ... with the first pixel's red in the low byte
        shr     edx, 3
        and     edx, 001F001Fh          ; both 5-bit reds

        add     eax, ebx
        add     eax, edx
        mov     [edi+ecx*4], eax
        inc     ecx
        jnz     .pair_loop

        pop     ecx
        and     ecx, BYTE 1             ; odd pixel left over?
        jz      .done

        mov     al, [esi+1]             ; green
        mov     ah, [esi+0]             ; blue
        and     al, 11111100b
        shr     ah, 3
        shl     eax, 3
        mov     bl, [esi+2]             ; red
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2

.done:
        jmp     _x86return


	
	
;; 32 bit RGB to 16 bit RGB 555
;;
;; Each output word is 0rrrrrgg gggbbbbb, built from the top 5 bits of source
;; byte 2 (red), byte 1 (green) and byte 0 (blue).
;;   In:       ESI = source, EDI = dest, ECX = pixel count (non-zero)
;;   Modifies: EAX, EBX, ECX, EDX, ESI, EDI
;;   Leaves by jumping to _x86return.

_ConvertX86p32_16RGB555:
        cmp     ecx, BYTE 16
        ja      .head                   ; more than 16 pixels: paired path

.short_loop:                            ; one pixel per pass
        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        and     al, 11111000b
        shr     ah, 3
        shl     eax, 2                  ; ax = 0rrrrrgg ggg00000
        mov     bl, [esi+0]             ; blue
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .short_loop
        jmp     _x86return

.head:                                  ; one pixel first if EDI is not a multiple of 4
        mov     ebx, edi
        and     ebx, BYTE 11b
        jz      .aligned

        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        and     al, 11111000b
        shr     ah, 3
        shl     eax, 2
        mov     bl, [esi+0]             ; blue
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx

.aligned:
        push    ecx                     ; remaining count, low bit tested for the tail
        shr     ecx, 1                  ; ecx = number of pixel pairs

        ; Aim both pointers at the end of the paired region and count a
        ; negative index up to zero.
        lea     esi, [esi+ecx*8]
        lea     edi, [edi+ecx*4]
        neg     ecx

.pair_loop:                             ; two pixels in, one dword out
        mov     eax, [esi+ecx*8]        ; first pixel of the pair
        mov     ebx, [esi+ecx*8+4]      ; second pixel
        mov     edx, [esi+ecx*8+4]      ; second pixel again, for the reds
        mov     dl, [esi+ecx*8+2]       ; ... with the first pixel's red in the low byte

        shr     ah, 3                   ; first pixel: green to 5 bits
        shr     eax, 3
        and     eax, 000007FFh          ; green+blue of the low word

        shr     bh, 3                   ; second pixel: green to 5 bits
        shl     ebx, 13
        and     ebx, 07FF0000h          ; green+blue of the high word

        shl     edx, 7
        and     edx, 07C007C00h         ; both 5-bit reds

        add     eax, ebx
        add     eax, edx
        mov     [edi+ecx*4], eax
        inc     ecx
        jnz     .pair_loop

        pop     ecx
        and     ecx, BYTE 1             ; odd pixel left over?
        jz      .done

        mov     al, [esi+1]             ; green
        mov     ah, [esi+2]             ; red
        and     al, 11111000b
        shr     ah, 3
        shl     eax, 2
        mov     bl, [esi+0]             ; blue
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2

.done:
        jmp     _x86return




;; 32 bit RGB to 16 bit BGR 555
;;
;; Each output word is 0bbbbbgg gggrrrrr, built from the top 5 bits of source
;; byte 0 (blue), byte 1 (green) and byte 2 (red).
;;   In:       ESI = source, EDI = dest, ECX = pixel count (non-zero)
;;   Modifies: EAX, EBX, ECX, EDX, ESI, EDI
;;   Leaves by jumping to _x86return.

_ConvertX86p32_16BGR555:
        cmp     ecx, BYTE 16
        ja      .head                   ; more than 16 pixels: paired path

.short_loop:                            ; one pixel per pass
        mov     al, [esi+1]             ; green
        mov     ah, [esi+0]             ; blue
        and     al, 11111000b
        shr     ah, 3
        shl     eax, 2                  ; ax = 0bbbbbgg ggg00000
        mov     bl, [esi+2]             ; red
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .short_loop
        jmp     _x86return

.head:                                  ; one pixel first if EDI is not a multiple of 4
        mov     ebx, edi
        and     ebx, BYTE 11b
        jz      .aligned

        mov     al, [esi+1]             ; green
        mov     ah, [esi+0]             ; blue
        and     al, 11111000b
        shr     ah, 3
        shl     eax, 2
        mov     bl, [esi+2]             ; red
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx

.aligned:
        push    ecx                     ; remaining count, low bit tested for the tail
        shr     ecx, 1                  ; ecx = number of pixel pairs

        ; Aim both pointers at the end of the paired region and count a
        ; negative index up to zero.
        lea     esi, [esi+ecx*8]
        lea     edi, [edi+ecx*4]
        neg     ecx

.pair_loop:                             ; two pixels in, one dword out
        mov     ah, [esi+ecx*8]         ; first pixel: blue
        mov     al, [esi+ecx*8+1]       ;              green
        shr     ah, 3
        shl     eax, 2
        and     eax, 00007FE0h          ; blue+green of the low word

        mov     bh, [esi+ecx*8+4]       ; second pixel: blue
        mov     bl, [esi+ecx*8+5]       ;               green
        shr     bh, 3
        shl     ebx, 18
        and     ebx, 07FE00000h         ; blue+green of the high word

        mov     edx, [esi+ecx*8+4]      ; second pixel, for its red
        mov     dl, [esi+ecx*8+2]       ; ... with the first pixel's red in the low byte
        shr     edx, 3
        and     edx, 001F001Fh          ; both 5-bit reds

        add     eax, ebx
        add     eax, edx
        mov     [edi+ecx*4], eax
        inc     ecx
        jnz     .pair_loop

        pop     ecx
        and     ecx, BYTE 1             ; odd pixel left over?
        jz      .done

        mov     al, [esi+1]             ; green
        mov     ah, [esi+0]             ; blue
        and     al, 11111000b
        shr     ah, 3
        shl     eax, 2
        mov     bl, [esi+2]             ; red
        shr     bl, 3
        add     al, bl
        mov     [edi+0], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2

.done:
        jmp     _x86return




	
;; FROM 32 BIT RGB to 8 BIT RGB (rrrgggbb)
;;
;; Each output byte holds the top 3 bits of source byte 2 (red) in bits 7..5,
;; the top 3 bits of byte 1 (green) in bits 4..2 and the top 2 bits of
;; byte 0 (blue) in bits 1..0.
;; This routine writes FOUR pixels at once (one dword) and then, if they
;; exist, up to three trailing pixels one byte at a time.
;;   In:       ESI = source, EDI = dest, ECX = pixel count (non-zero)
;;   Modifies: EAX, EBX, ECX, EDX, ESI, EDI
;;   Leaves by jumping to _x86return.
_ConvertX86p32_8RGB332:

        push    ecx                     ; full count, needed for the tail

        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      NEAR .tail              ; fewer than four pixels: tail only

.quad_loop:
        mov     eax, [esi]              ; first pair of pixels: eax = pixel 0
        mov     edx, [esi+4]            ;                       edx = pixel 1

        shr     dl, 6                   ; pixel 1: blue -> bits 1..0
        mov     ebx, eax                ; copy of pixel 0, used for its red byte

        shr     al, 6                   ; pixel 0: blue -> bits 1..0
        and     ah, 0e0h                ; pixel 0: top 3 green bits

        shr     ebx, 16                 ; bl = pixel 0 red
        and     dh, 0e0h                ; pixel 1: top 3 green bits

        shr     ah, 3                   ; pixel 0: green -> bits 4..2
        and     bl, 0e0h                ; pixel 0: red stays in bits 7..5

        shr     dh, 3                   ; pixel 1: green -> bits 4..2

        or      al, bl

        mov     ebx, edx                ; copy of pixel 1 (its upper half is untouched)
        or      al, ah                  ; al = finished pixel 0

        shr     ebx, 16                 ; bl = pixel 1 red
        or      dl, dh

        and     bl, 0e0h

        or      dl, bl                  ; dl = finished pixel 1

        mov     ah, dl                  ; ax = pixel 1 : pixel 0

        mov     ebx, [esi+8]            ; second pair of pixels: ebx = pixel 2

        mov     edx, ebx
        and     bh, 0e0h                ; pixel 2: top 3 green bits

        shr     bl, 6                   ; pixel 2: blue -> bits 1..0
        and     edx, 0e00000h           ; pixel 2: top 3 red bits

        shr     edx, 16                 ; ... moved down into dl bits 7..5

        shr     bh, 3                   ; pixel 2: green -> bits 4..2

        ror     eax, 16                 ; park pixels 0/1 in the upper half of eax
        or      bl, dl

        mov     edx, [esi+12]           ; edx = pixel 3
        or      bl, bh                  ; bl = finished pixel 2

        mov     al, bl

        mov     ebx, edx
        and     dh, 0e0h                ; pixel 3: top 3 green bits

        shr     dl, 6                   ; pixel 3: blue -> bits 1..0
        and     ebx, 0e00000h           ; pixel 3: top 3 red bits

        shr     dh, 3                   ; pixel 3: green -> bits 4..2
        mov     ah, dl

        shr     ebx, 16                 ; red -> bl bits 7..5
        or      ah, dh

        or      ah, bl                  ; ah = finished pixel 3

        rol     eax, 16                 ; eax = pixel 3 : 2 : 1 : 0
        add     esi, BYTE 16

        mov     [edi], eax
        add     edi, BYTE 4

        dec     ecx
        jnz     NEAR .quad_loop         ; NEAR: the body may be too long for a short jump

.tail:
        pop     ecx
        and     ecx, BYTE 3             ; number of pixels still to convert

        jz      .done                   ; nothing left

.tail_loop:
        mov     eax, [esi]              ; single pixel conversion for trailing pixels

        mov     ebx, eax

        shr     al, 6                   ; blue -> bits 1..0
        and     ah, 0e0h                ; top 3 green bits

        shr     ebx, 16                 ; bl = red

        shr     ah, 3                   ; green -> bits 4..2
        and     bl, 0e0h                ; red stays in bits 7..5

        or      al, ah
        or      al, bl

        mov     [edi], al

        inc     edi
        add     esi, BYTE 4

        dec     ecx
        jnz     .tail_loop

.done:
        jmp     _x86return