Mercurial > sdl-ios-xcode
diff src/hermes/x86p_32.asm @ 0:74212992fb08
Initial revision
author | Sam Lantinga <slouken@lokigames.com> |
---|---|
date | Thu, 26 Apr 2001 16:45:43 +0000 |
parents | |
children | da33b7e6d181 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/hermes/x86p_32.asm Thu Apr 26 16:45:43 2001 +0000 @@ -0,0 +1,1043 @@ +; +; x86 format converters for HERMES +; Some routines Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at) +; This source code is licensed under the GNU LGPL +; +; Please refer to the file COPYING.LIB contained in the distribution for +; licensing conditions +; +; Most routines are (c) Glenn Fiedler (ptc@gaffer.org), used with permission +; + + +BITS 32 + +GLOBAL _ConvertX86p32_32BGR888 +GLOBAL _ConvertX86p32_32RGBA888 +GLOBAL _ConvertX86p32_32BGRA888 +GLOBAL _ConvertX86p32_24RGB888 +GLOBAL _ConvertX86p32_24BGR888 +GLOBAL _ConvertX86p32_16RGB565 +GLOBAL _ConvertX86p32_16BGR565 +GLOBAL _ConvertX86p32_16RGB555 +GLOBAL _ConvertX86p32_16BGR555 +GLOBAL _ConvertX86p32_8RGB332 + +EXTERN _x86return + +SECTION .text + + +;; _Convert_* +;; Paramters: +;; ESI = source +;; EDI = dest +;; ECX = amount (NOT 0!!! (the _ConvertX86 routine checks for that though)) +;; Destroys: +;; EAX, EBX, EDX + + +_ConvertX86p32_32BGR888: + + ; check short + cmp ecx,BYTE 32 + ja .L3 + +.L1 ; short loop + mov edx,[esi] + bswap edx + ror edx,8 + mov [edi],edx + add esi,BYTE 4 + add edi,BYTE 4 + dec ecx + jnz .L1 +.L2 + jmp _x86return + +.L3 ; save ebp + push ebp + + ; unroll four times + mov ebp,ecx + shr ebp,2 + + ; save count + push ecx + +.L4 mov eax,[esi] + mov ebx,[esi+4] + + bswap eax + + bswap ebx + + ror eax,8 + mov ecx,[esi+8] + + ror ebx,8 + mov edx,[esi+12] + + bswap ecx + + bswap edx + + ror ecx,8 + mov [edi+0],eax + + ror edx,8 + mov [edi+4],ebx + + mov [edi+8],ecx + mov [edi+12],edx + + add esi,BYTE 16 + add edi,BYTE 16 + + dec ebp + jnz .L4 + + ; check tail + pop ecx + and ecx,BYTE 11b + jz .L6 + +.L5 ; tail loop + mov edx,[esi] + bswap edx + ror edx,8 + mov [edi],edx + add esi,BYTE 4 + add edi,BYTE 4 + dec ecx + jnz .L5 + +.L6 pop ebp + jmp _x86return + + + + +_ConvertX86p32_32RGBA888: + + ; check short + cmp ecx,BYTE 32 + ja .L3 + +.L1 ; short loop + mov edx,[esi] + rol edx,8 + mov [edi],edx + add esi,BYTE 4 + add edi,BYTE 4 + dec ecx + jnz .L1 +.L2 + jmp _x86return + +.L3 ; save ebp + push ebp + + ; unroll four times + mov ebp,ecx + shr ebp,2 + + ; save count + push ecx + +.L4 mov eax,[esi] + mov ebx,[esi+4] + + rol eax,8 + mov ecx,[esi+8] + + rol ebx,8 + mov edx,[esi+12] + + rol ecx,8 + mov [edi+0],eax + + rol edx,8 + mov [edi+4],ebx + + mov [edi+8],ecx + mov [edi+12],edx + + add esi,BYTE 16 + add edi,BYTE 16 + + dec ebp + jnz .L4 + + ; check tail + pop ecx + and ecx,BYTE 11b + jz .L6 + +.L5 ; tail loop + mov edx,[esi] + rol edx,8 + mov [edi],edx + add esi,BYTE 4 + add edi,BYTE 4 + dec ecx + jnz .L5 + +.L6 pop ebp + jmp _x86return + + + + +_ConvertX86p32_32BGRA888: + + ; check short + cmp ecx,BYTE 32 + ja .L3 + +.L1 ; short loop + mov edx,[esi] + bswap edx + mov [edi],edx + add esi,BYTE 4 + add edi,BYTE 4 + dec ecx + jnz .L1 +.L2 + jmp _x86return + +.L3 ; save ebp + push ebp + + ; unroll four times + mov ebp,ecx + shr ebp,2 + + ; save count + push ecx + +.L4 mov eax,[esi] + mov ebx,[esi+4] + + mov ecx,[esi+8] + mov edx,[esi+12] + + bswap eax + + bswap ebx + + bswap ecx + + bswap edx + + mov [edi+0],eax + mov [edi+4],ebx + + mov [edi+8],ecx + mov [edi+12],edx + + add esi,BYTE 16 + add edi,BYTE 16 + + dec ebp + jnz .L4 + + ; check tail + pop ecx + and ecx,BYTE 11b + jz .L6 + +.L5 ; tail loop + mov edx,[esi] + bswap edx + mov [edi],edx + add esi,BYTE 4 + add edi,BYTE 4 + dec ecx + jnz .L5 + +.L6 pop ebp + jmp _x86return + + + + +;; 32 bit RGB 888 to 24 BIT RGB 888 + +_ConvertX86p32_24RGB888: + + ; check short + cmp ecx,BYTE 32 + ja .L3 + +.L1 ; short loop + mov al,[esi] + mov bl,[esi+1] + mov dl,[esi+2] + mov [edi],al + mov [edi+1],bl + mov [edi+2],dl + add esi,BYTE 4 + add edi,BYTE 3 + dec ecx + jnz .L1 +.L2 + jmp _x86return + +.L3 ; head + mov edx,edi + and edx,BYTE 11b + jz .L4 + mov al,[esi] + mov bl,[esi+1] + mov dl,[esi+2] + mov [edi],al + mov [edi+1],bl + mov [edi+2],dl + add esi,BYTE 4 + add edi,BYTE 3 + dec ecx + jmp SHORT .L3 + +.L4 ; unroll 4 times + push ebp + mov ebp,ecx + shr ebp,2 + + ; save count + push ecx + +.L5 mov eax,[esi] ; first dword eax = [A][R][G][B] + mov ebx,[esi+4] ; second dword ebx = [a][r][g][b] + + shl eax,8 ; eax = [R][G][B][.] + mov ecx,[esi+12] ; third dword ecx = [a][r][g][b] + + shl ebx,8 ; ebx = [r][g][b][.] + mov al,[esi+4] ; eax = [R][G][B][b] + + ror eax,8 ; eax = [b][R][G][B] (done) + mov bh,[esi+8+1] ; ebx = [r][g][G][.] + + mov [edi],eax + add edi,BYTE 3*4 + + shl ecx,8 ; ecx = [r][g][b][.] + mov bl,[esi+8+0] ; ebx = [r][g][G][B] + + rol ebx,16 ; ebx = [G][B][r][g] (done) + mov cl,[esi+8+2] ; ecx = [r][g][b][R] (done) + + mov [edi+4-3*4],ebx + add esi,BYTE 4*4 + + mov [edi+8-3*4],ecx + dec ebp + + jnz .L5 + + ; check tail + pop ecx + and ecx,BYTE 11b + jz .L7 + +.L6 ; tail loop + mov al,[esi] + mov bl,[esi+1] + mov dl,[esi+2] + mov [edi],al + mov [edi+1],bl + mov [edi+2],dl + add esi,BYTE 4 + add edi,BYTE 3 + dec ecx + jnz .L6 + +.L7 pop ebp + jmp _x86return + + + + +;; 32 bit RGB 888 to 24 bit BGR 888 + +_ConvertX86p32_24BGR888: + + ; check short + cmp ecx,BYTE 32 + ja .L3 + + +.L1 ; short loop + mov dl,[esi] + mov bl,[esi+1] + mov al,[esi+2] + mov [edi],al + mov [edi+1],bl + mov [edi+2],dl + add esi,BYTE 4 + add edi,BYTE 3 + dec ecx + jnz .L1 +.L2 + jmp _x86return + +.L3 ; head + mov edx,edi + and edx,BYTE 11b + jz .L4 + mov dl,[esi] + mov bl,[esi+1] + mov al,[esi+2] + mov [edi],al + mov [edi+1],bl + mov [edi+2],dl + add esi,BYTE 4 + add edi,BYTE 3 + dec ecx + jmp SHORT .L3 + +.L4 ; unroll 4 times + push ebp + mov ebp,ecx + shr ebp,2 + + ; save count + push ecx + +.L5 + mov eax,[esi] ; first dword eax = [A][R][G][B] + mov ebx,[esi+4] ; second dword ebx = [a][r][g][b] + + bswap eax ; eax = [B][G][R][A] + + bswap ebx ; ebx = [b][g][r][a] + + mov al,[esi+4+2] ; eax = [B][G][R][r] + mov bh,[esi+4+4+1] ; ebx = [b][g][G][a] + + ror eax,8 ; eax = [r][B][G][R] (done) + mov bl,[esi+4+4+2] ; ebx = [b][g][G][R] + + ror ebx,16 ; ebx = [G][R][b][g] (done) + mov [edi],eax + + mov [edi+4],ebx + mov ecx,[esi+12] ; third dword ecx = [a][r][g][b] + + bswap ecx ; ecx = [b][g][r][a] + + mov cl,[esi+8] ; ecx = [b][g][r][B] (done) + add esi,BYTE 4*4 + + mov [edi+8],ecx + add edi,BYTE 3*4 + + dec ebp + jnz .L5 + + ; check tail + pop ecx + and ecx,BYTE 11b + jz .L7 + +.L6 ; tail loop + mov dl,[esi] + mov bl,[esi+1] + mov al,[esi+2] + mov [edi],al + mov [edi+1],bl + mov [edi+2],dl + add esi,BYTE 4 + add edi,BYTE 3 + dec ecx + jnz .L6 + +.L7 + pop ebp + jmp _x86return + + + + +;; 32 bit RGB 888 to 16 BIT RGB 565 + +_ConvertX86p32_16RGB565: + ; check short + cmp ecx,BYTE 16 + ja .L3 + +.L1 ; short loop + mov bl,[esi+0] ; blue + mov al,[esi+1] ; green + mov ah,[esi+2] ; red + shr ah,3 + and al,11111100b + shl eax,3 + shr bl,3 + add al,bl + mov [edi+0],al + mov [edi+1],ah + add esi,BYTE 4 + add edi,BYTE 2 + dec ecx + jnz .L1 + +.L2: ; End of short loop + jmp _x86return + + +.L3 ; head + mov ebx,edi + and ebx,BYTE 11b + jz .L4 + + mov bl,[esi+0] ; blue + mov al,[esi+1] ; green + mov ah,[esi+2] ; red + shr ah,3 + and al,11111100b + shl eax,3 + shr bl,3 + add al,bl + mov [edi+0],al + mov [edi+1],ah + add esi,BYTE 4 + add edi,BYTE 2 + dec ecx + +.L4: + ; save count + push ecx + + ; unroll twice + shr ecx,1 + + ; point arrays to end + lea esi,[esi+ecx*8] + lea edi,[edi+ecx*4] + + ; negative counter + neg ecx + jmp SHORT .L6 + +.L5: + mov [edi+ecx*4-4],eax +.L6: + mov eax,[esi+ecx*8] + + shr ah,2 + mov ebx,[esi+ecx*8+4] + + shr eax,3 + mov edx,[esi+ecx*8+4] + + shr bh,2 + mov dl,[esi+ecx*8+2] + + shl ebx,13 + and eax,000007FFh + + shl edx,8 + and ebx,07FF0000h + + and edx,0F800F800h + add eax,ebx + + add eax,edx + inc ecx + + jnz .L5 + + mov [edi+ecx*4-4],eax + + ; tail + pop ecx + test cl,1 + jz .L7 + + mov bl,[esi+0] ; blue + mov al,[esi+1] ; green + mov ah,[esi+2] ; red + shr ah,3 + and al,11111100b + shl eax,3 + shr bl,3 + add al,bl + mov [edi+0],al + mov [edi+1],ah + add esi,BYTE 4 + add edi,BYTE 2 + +.L7: + jmp _x86return + + + + +;; 32 bit RGB 888 to 16 BIT BGR 565 + +_ConvertX86p32_16BGR565: + + ; check short + cmp ecx,BYTE 16 + ja .L3 + +.L1 ; short loop + mov ah,[esi+0] ; blue + mov al,[esi+1] ; green + mov bl,[esi+2] ; red + shr ah,3 + and al,11111100b + shl eax,3 + shr bl,3 + add al,bl + mov [edi+0],al + mov [edi+1],ah + add esi,BYTE 4 + add edi,BYTE 2 + dec ecx + jnz .L1 +.L2 + jmp _x86return + +.L3 ; head + mov ebx,edi + and ebx,BYTE 11b + jz .L4 + mov ah,[esi+0] ; blue + mov al,[esi+1] ; green + mov bl,[esi+2] ; red + shr ah,3 + and al,11111100b + shl eax,3 + shr bl,3 + add al,bl + mov [edi+0],al + mov [edi+1],ah + add esi,BYTE 4 + add edi,BYTE 2 + dec ecx + +.L4 ; save count + push ecx + + ; unroll twice + shr ecx,1 + + ; point arrays to end + lea esi,[esi+ecx*8] + lea edi,[edi+ecx*4] + + ; negative count + neg ecx + jmp SHORT .L6 + +.L5 + mov [edi+ecx*4-4],eax +.L6 + mov edx,[esi+ecx*8+4] + + mov bh,[esi+ecx*8+4] + mov ah,[esi+ecx*8] + + shr bh,3 + mov al,[esi+ecx*8+1] + + shr ah,3 + mov bl,[esi+ecx*8+5] + + shl eax,3 + mov dl,[esi+ecx*8+2] + + shl ebx,19 + and eax,0000FFE0h + + shr edx,3 + and ebx,0FFE00000h + + and edx,001F001Fh + add eax,ebx + + add eax,edx + inc ecx + + jnz .L5 + + mov [edi+ecx*4-4],eax + + ; tail + pop ecx + and ecx,BYTE 1 + jz .L7 + mov ah,[esi+0] ; blue + mov al,[esi+1] ; green + mov bl,[esi+2] ; red + shr ah,3 + and al,11111100b + shl eax,3 + shr bl,3 + add al,bl + mov [edi+0],al + mov [edi+1],ah + add esi,BYTE 4 + add edi,BYTE 2 + +.L7 + jmp _x86return + + + + +;; 32 BIT RGB TO 16 BIT RGB 555 + +_ConvertX86p32_16RGB555: + + ; check short + cmp ecx,BYTE 16 + ja .L3 + +.L1 ; short loop + mov bl,[esi+0] ; blue + mov al,[esi+1] ; green + mov ah,[esi+2] ; red + shr ah,3 + and al,11111000b + shl eax,2 + shr bl,3 + add al,bl + mov [edi+0],al + mov [edi+1],ah + add esi,BYTE 4 + add edi,BYTE 2 + dec ecx + jnz .L1 +.L2 + jmp _x86return + +.L3 ; head + mov ebx,edi + and ebx,BYTE 11b + jz .L4 + mov bl,[esi+0] ; blue + mov al,[esi+1] ; green + mov ah,[esi+2] ; red + shr ah,3 + and al,11111000b + shl eax,2 + shr bl,3 + add al,bl + mov [edi+0],al + mov [edi+1],ah + add esi,BYTE 4 + add edi,BYTE 2 + dec ecx + +.L4 ; save count + push ecx + + ; unroll twice + shr ecx,1 + + ; point arrays to end + lea esi,[esi+ecx*8] + lea edi,[edi+ecx*4] + + ; negative counter + neg ecx + jmp SHORT .L6 + +.L5 + mov [edi+ecx*4-4],eax +.L6 + mov eax,[esi+ecx*8] + + shr ah,3 + mov ebx,[esi+ecx*8+4] + + shr eax,3 + mov edx,[esi+ecx*8+4] + + shr bh,3 + mov dl,[esi+ecx*8+2] + + shl ebx,13 + and eax,000007FFh + + shl edx,7 + and ebx,07FF0000h + + and edx,07C007C00h + add eax,ebx + + add eax,edx + inc ecx + + jnz .L5 + + mov [edi+ecx*4-4],eax + + ; tail + pop ecx + and ecx,BYTE 1 + jz .L7 + mov bl,[esi+0] ; blue + mov al,[esi+1] ; green + mov ah,[esi+2] ; red + shr ah,3 + and al,11111000b + shl eax,2 + shr bl,3 + add al,bl + mov [edi+0],al + mov [edi+1],ah + add esi,BYTE 4 + add edi,BYTE 2 + +.L7 + jmp _x86return + + + + +;; 32 BIT RGB TO 16 BIT BGR 555 + +_ConvertX86p32_16BGR555: + + ; check short + cmp ecx,BYTE 16 + ja .L3 + + +.L1 ; short loop + mov ah,[esi+0] ; blue + mov al,[esi+1] ; green + mov bl,[esi+2] ; red + shr ah,3 + and al,11111000b + shl eax,2 + shr bl,3 + add al,bl + mov [edi+0],al + mov [edi+1],ah + add esi,BYTE 4 + add edi,BYTE 2 + dec ecx + jnz .L1 +.L2 + jmp _x86return + +.L3 ; head + mov ebx,edi + and ebx,BYTE 11b + jz .L4 + mov ah,[esi+0] ; blue + mov al,[esi+1] ; green + mov bl,[esi+2] ; red + shr ah,3 + and al,11111000b + shl eax,2 + shr bl,3 + add al,bl + mov [edi+0],al + mov [edi+1],ah + add esi,BYTE 4 + add edi,BYTE 2 + dec ecx + +.L4 ; save count + push ecx + + ; unroll twice + shr ecx,1 + + ; point arrays to end + lea esi,[esi+ecx*8] + lea edi,[edi+ecx*4] + + ; negative counter + neg ecx + jmp SHORT .L6 + +.L5 + mov [edi+ecx*4-4],eax +.L6 + mov edx,[esi+ecx*8+4] + + mov bh,[esi+ecx*8+4] + mov ah,[esi+ecx*8] + + shr bh,3 + mov al,[esi+ecx*8+1] + + shr ah,3 + mov bl,[esi+ecx*8+5] + + shl eax,2 + mov dl,[esi+ecx*8+2] + + shl ebx,18 + and eax,00007FE0h + + shr edx,3 + and ebx,07FE00000h + + and edx,001F001Fh + add eax,ebx + + add eax,edx + inc ecx + + jnz .L5 + + mov [edi+ecx*4-4],eax + + ; tail + pop ecx + and ecx,BYTE 1 + jz .L7 + mov ah,[esi+0] ; blue + mov al,[esi+1] ; green + mov bl,[esi+2] ; red + shr ah,3 + and al,11111000b + shl eax,2 + shr bl,3 + add al,bl + mov [edi+0],al + mov [edi+1],ah + add esi,BYTE 4 + add edi,BYTE 2 + +.L7 + jmp _x86return + + + + + +;; FROM 32 BIT RGB to 8 BIT RGB (rrrgggbbb) +;; This routine writes FOUR pixels at once (dword) and then, if they exist +;; the trailing three pixels +_ConvertX86p32_8RGB332: + + +.L_ALIGNED + push ecx + + shr ecx,2 ; We will draw 4 pixels at once + jnz .L1 + + jmp .L2 ; short jump out of range :( + +.L1: + mov eax,[esi] ; first pair of pixels + mov edx,[esi+4] + + shr dl,6 + mov ebx,eax + + shr al,6 + and ah,0e0h + + shr ebx,16 + and dh,0e0h + + shr ah,3 + and bl,0e0h + + shr dh,3 + + or al,bl + + mov ebx,edx + or al,ah + + shr ebx,16 + or dl,dh + + and bl,0e0h + + or dl,bl + + mov ah,dl + + + + mov ebx,[esi+8] ; second pair of pixels + + mov edx,ebx + and bh,0e0h + + shr bl,6 + and edx,0e00000h + + shr edx,16 + + shr bh,3 + + ror eax,16 + or bl,dl + + mov edx,[esi+12] + or bl,bh + + mov al,bl + + mov ebx,edx + and dh,0e0h + + shr dl,6 + and ebx,0e00000h + + shr dh,3 + mov ah,dl + + shr ebx,16 + or ah,dh + + or ah,bl + + rol eax,16 + add esi,BYTE 16 + + mov [edi],eax + add edi,BYTE 4 + + dec ecx + jz .L2 ; L1 out of range for short jump :( + + jmp .L1 +.L2: + + pop ecx + and ecx,BYTE 3 ; mask out number of pixels to draw + + jz .L4 ; Nothing to do anymore + +.L3: + mov eax,[esi] ; single pixel conversion for trailing pixels + + mov ebx,eax + + shr al,6 + and ah,0e0h + + shr ebx,16 + + shr ah,3 + and bl,0e0h + + or al,ah + or al,bl + + mov [edi],al + + inc edi + add esi,BYTE 4 + + dec ecx + jnz .L3 + +.L4: + jmp _x86return