Mercurial > sdl-ios-xcode
diff src/hermes/mmxp2_32.asm @ 0:74212992fb08
Initial revision
author | Sam Lantinga <slouken@lokigames.com> |
---|---|
date | Thu, 26 Apr 2001 16:45:43 +0000 |
parents | |
children | 77b6110c797d |
line wrap: on
line diff
;
; pII-optimised MMX format converters for HERMES
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
; COPYRIGHT NOTICE
;
; This file partly contains code that is (c) Intel Corporation, specifically
; the mode detection routine, and the converter to 15 bit (8 pixel
; conversion routine from the mmx programming tutorial pages).
;
;
; These routines aren't exactly pII optimised - it's just that as they
; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
; optimise them for p5 MMXs..
;
; Syntax: NASM, 32-bit flat code.
;
; Register interface used by every converter below:
;   esi = source pointer, one dword per pixel (A R G B from high byte to low)
;   edi = destination pointer
;   ecx = number of pixels to convert
; Every converter leaves through "jmp _mmxreturn" instead of "ret".
; _mmxreturn is defined outside this file.
; NOTE(review): it presumably restores registers and MMX state - confirm
; against its definition.
; Overwritten: eax, ecx, edx, esi, edi, mm0-mm7 and flags; the 24-bit and
; the 565 routines also overwrite ebx.

BITS 32


GLOBAL _ConvertMMXpII32_24RGB888
GLOBAL _ConvertMMXpII32_16RGB565
GLOBAL _ConvertMMXpII32_16BGR565
GLOBAL _ConvertMMXpII32_16RGB555
GLOBAL _ConvertMMXpII32_16BGR555

EXTERN _mmxreturn

SECTION .data

ALIGN 8

;; Constants for conversion routines.
;; Each one is a per-pixel dword replicated twice to fill a 64-bit register.

mmx32_rgb888_mask dd 00ffffffh, 00ffffffh      ; drop the alpha byte

mmx32_rgb565_b    dd 000000f8h, 000000f8h      ; top 5 bits of blue
mmx32_rgb565_g    dd 0000fc00h, 0000fc00h      ; top 6 bits of green
mmx32_rgb565_r    dd 00f80000h, 00f80000h      ; top 5 bits of red

mmx32_rgb555_rb   dd 00f800f8h, 00f800f8h      ; top 5 bits of red and of blue
mmx32_rgb555_g    dd 0000f800h, 0000f800h      ; top 5 bits of green
mmx32_rgb555_mul  dd 20000008h, 20000008h      ; pmaddwd: blue*8 + red*2000h
mmx32_bgr555_mul  dd 00082000h, 00082000h      ; pmaddwd: blue*2000h + red*8



SECTION .text

;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
; Packs 32-bit pixels into 3 bytes per pixel by dropping the top byte.
; Four pixels (16 source bytes -> 12 destination bytes) per MMX pass,
; the remaining 0-3 pixels are copied bytewise.
; Lane comments list the 8 bytes of a register from high to low;
; the digit is the pixel index within the group of four.
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        movq    mm6, qword [mmx32_rgb888_mask]  ; mm6 = alpha-removal mask
        pxor    mm7, mm7                        ; mm7 = 0, used as unpack filler

        mov     edx, ecx                ; keep the full count for the tail
        and     ecx, 0fffffffch         ; pixels that fit in groups of four
        jz      .tail

.quad_loop:
        movq    mm0, [esi]              ; A1 R1 G1 B1 A0 R0 G0 B0
        pand    mm0, mm6                ;  0 R1 G1 B1  0 R0 G0 B0
        movq    mm1, [esi+8]            ; A3 R3 G3 B3 A2 R2 G2 B2
        pand    mm1, mm6                ;  0 R3 G3 B3  0 R2 G2 B2

        movq    mm2, mm0
        punpckhdq mm2, mm7              ;  0  0  0  0  0 R1 G1 B1
        punpckldq mm0, mm7              ;  0  0  0  0  0 R0 G0 B0
        psllq   mm2, 24                 ;  0  0 R1 G1 B1  0  0  0
        por     mm0, mm2                ;  0  0 R1 G1 B1 R0 G0 B0

        movq    mm3, mm1
        psllq   mm3, 48                 ; G2 B2  0  0  0  0  0  0
        por     mm0, mm3                ; G2 B2 R1 G1 B1 R0 G0 B0

        movq    mm4, mm1
        punpckhdq mm4, mm7              ;  0  0  0  0  0 R3 G3 B3
        punpckldq mm1, mm7              ;  0  0  0  0  0 R2 G2 B2
        psrlq   mm1, 16                 ;  0  0  0  0  0  0  0 R2
        psllq   mm4, 8                  ;  0  0  0  0 R3 G3 B3  0
        por     mm1, mm4                ;  0  0  0  0 R3 G3 B3 R2

        movq    [edi], mm0              ; destination bytes 0..7
        add     esi, BYTE 16
        movd    [edi+8], mm1            ; destination bytes 8..11
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .quad_loop

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0-3 pixels left over
        jz      .done

.tail_loop:
        mov     ax, [esi]               ; al = blue, ah = green
        mov     dl, [esi+2]             ; red (edx is free, count is in ecx)
        mov     [edi], ax
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .tail_loop

.done:
        jmp     _mmxreturn



;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
; Converts 32-bit pixels to 16-bit words laid out as
;   bits 15..11 = red, bits 10..5 = green, bits 4..0 = blue.
; Four pixels per MMX pass, the remaining 0-3 pixels one at a time.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        ; set up masks
        movq    mm5, [mmx32_rgb565_b]
        movq    mm6, [mmx32_rgb565_g]
        movq    mm7, [mmx32_rgb565_r]

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad_loop:
        ; pixels 0 and 1 (comments show one dword, high to low)
        movq    mm0, [esi]              ; a r g b
        movq    mm1, mm0
        pand    mm0, mm6                ; 0 0 g 0
        movq    mm3, mm1
        pand    mm1, mm5                ; 0 0 0 b
        pand    mm3, mm7                ; 0 r 0 0   (kept for the pack below)
        pslld   mm1, 2                  ; 0 0 000000bb bbb00000
        por     mm0, mm1                ; 0 0 ggggggbb bbb00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggbbbbb

        ; pixels 2 and 3
        movq    mm4, [esi+8]            ; a r g b
        movq    mm2, mm4
        pand    mm4, mm6                ; 0 0 g 0
        movq    mm1, mm2
        pand    mm2, mm5                ; 0 0 0 b
        pand    mm1, mm7                ; 0 r 0 0
        pslld   mm2, 2                  ; 0 0 000000bb bbb00000
        por     mm4, mm2                ; 0 0 ggggggbb bbb00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggbbbbb

        ; Red sits in the low byte of each high word (00f8h), so packing
        ; words to bytes puts it in the high byte of each resulting word.
        packuswb mm3, mm1               ; four words of rrrrr000 00000000
        packssdw mm0, mm4               ; four words of 00000ggg gggbbbbb
        por     mm0, mm3                ; four finished 565 pixels
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad_loop

.tail:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0-3 pixels left over
        jz      .done

.tail_loop:
        mov     al, [esi]               ; blue
        mov     bh, [esi+1]             ; green -> bits 15..8 of ebx
        mov     ah, [esi+2]             ; red   -> bits 15..8 of eax
        shr     al, 3                   ; blue  -> bits 4..0
        and     eax, 0F81Fh             ; keep red 15..11 and blue 4..0
                                        ; (mask does not fit a BYTE immediate)
        shr     ebx, 5                  ; green -> bits 10..3
        and     ebx, 07E0h              ; keep green 10..5, drops stale bl/high bits
        add     eax, ebx                ; fields are disjoint, add acts as or
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .tail_loop

.done:
        jmp     _mmxreturn


;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
; Same as the RGB565 converter with red and blue exchanged:
;   bits 15..11 = blue, bits 10..5 = green, bits 4..0 = red.
; Note that mm5 holds the RED mask and mm7 the BLUE mask here.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        movq    mm5, [mmx32_rgb565_r]
        movq    mm6, [mmx32_rgb565_g]
        movq    mm7, [mmx32_rgb565_b]

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad_loop:
        ; pixels 0 and 1
        movq    mm0, [esi]              ; a r g b
        movq    mm1, mm0
        pand    mm0, mm6                ; 0 0 g 0
        movq    mm3, mm1
        pand    mm1, mm5                ; 0 r 0 0
        pand    mm3, mm7                ; 0 0 0 b

        psllq   mm3, 16                 ; 0 b 0 0   (stays inside each dword)
        psrld   mm1, 14                 ; 0 0 000000rr rrr00000
        por     mm0, mm1                ; 0 0 ggggggrr rrr00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggrrrrr

        ; pixels 2 and 3
        movq    mm4, [esi+8]            ; a r g b
        movq    mm2, mm4
        pand    mm4, mm6                ; 0 0 g 0
        movq    mm1, mm2
        pand    mm2, mm5                ; 0 r 0 0
        pand    mm1, mm7                ; 0 0 0 b

        psllq   mm1, 16                 ; 0 b 0 0
        psrld   mm2, 14                 ; 0 0 000000rr rrr00000
        por     mm4, mm2                ; 0 0 ggggggrr rrr00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggrrrrr

        packuswb mm3, mm1               ; four words of bbbbb000 00000000
        packssdw mm0, mm4               ; four words of 00000ggg gggrrrrr
        por     mm0, mm3                ; four words of bbbbbggg gggrrrrr
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .quad_loop

.tail:
        and     edx, BYTE 3             ; 0-3 pixels left over, counted in edx
        jz      .done

.tail_loop:
        mov     al, [esi+2]             ; red
        mov     bh, [esi+1]             ; green -> bits 15..8 of ebx
        mov     ah, [esi]               ; blue  -> bits 15..8 of eax
        shr     al, 3                   ; red   -> bits 4..0
        and     eax, 0F81Fh             ; keep blue 15..11 and red 4..0
                                        ; (mask does not fit a BYTE immediate)
        shr     ebx, 5                  ; green -> bits 10..3
        and     ebx, 07E0h              ; keep green 10..5, drops stale bl/high bits
        add     eax, ebx                ; fields are disjoint, add acts as or
        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .tail_loop

.done:
        jmp     _mmxreturn


;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555
; Output word: bits 14..10 = blue, bits 9..5 = green, bits 4..0 = red.
;
; The 16BGR555 converter is identical to the RGB555 one, except it uses a
; different multiplier for the pmaddwd instruction.  cool huh.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        movq    mm7, qword [mmx32_bgr555_mul]
        jmp     _convert_bgr555_cheat

;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB555
; Output word: bits 14..10 = red, bits 9..5 = green, bits 4..0 = blue.
;
; Uses the pmaddwd technique of the Intel routine: red and blue are masked
; to their top 5 bits and one multiply-add moves both to their target
; positions (scaled up by 6 bits); green is or-ed in and a single right
; shift by 6 finishes the pixel.  ('mmx32_rgb555_add' of the Intel code is
; called 'mmx32_rgb555_mul' here, which is a more accurate name.)
;
; Differences from the Intel loop, both deliberate:
;  * The loop is not software-pipelined.  The pipelined form loaded the
;    next 16 source bytes before testing the count, so its last pass read
;    16 bytes beyond the final full group of eight pixels.  This loop
;    loads only what it converts.
;  * Left-over pixels go through the same multiplier (mm7) as the main
;    loop.  A scalar tail hard-wired to the RGB layout would give wrong
;    colours for the BGR555 entry point above.
;
; mm5 = red/blue mask, mm6 = green mask, mm7 = multiplier (set by caller
; of _convert_bgr555_cheat).
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB555:

        movq    mm7, qword [mmx32_rgb555_mul]
_convert_bgr555_cheat:
        movq    mm6, qword [mmx32_rgb555_g]
        movq    mm5, qword [mmx32_rgb555_rb]

        mov     edx, ecx                ; keep the full count for the tail

        and     ecx, 0fffffff8h         ; pixels that fit in groups of eight
        jz      near .tail              ; loop body is close to the short-jump limit

.oct_loop:
        ; pixels 0-3 -> [edi]
        movq    mm0, [esi]
        movq    mm2, [esi+8]
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5                ; top 5 bits of red and blue
        pand    mm3, mm5
        pmaddwd mm1, mm7                ; place red and blue (scaled by 64)
        pmaddwd mm3, mm7
        pand    mm0, mm6                ; top 5 bits of green
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6                  ; 0 0 0rrrrrgg gggbbbbb (or b/r swapped)
        psrld   mm3, 6
        packssdw mm1, mm3               ; values are <= 7fffh, no saturation
        movq    [edi], mm1

        ; pixels 4-7 -> [edi+8]
        movq    mm0, [esi+16]
        movq    mm2, [esi+24]
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, mm5
        pand    mm3, mm5
        pmaddwd mm1, mm7
        pmaddwd mm3, mm7
        pand    mm0, mm6
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6
        psrld   mm3, 6
        packssdw mm1, mm3
        movq    [edi+8], mm1

        add     esi, BYTE 32
        add     edi, BYTE 16
        sub     ecx, BYTE 8
        jnz     .oct_loop

.tail:
        mov     ecx, edx
        and     ecx, BYTE 7             ; 0-7 pixels left over
        jz      .done

.tail_loop:
        movd    mm0, [esi]              ; one pixel, upper half of mm0 is zero
        movq    mm1, mm0
        pand    mm1, mm5                ; top 5 bits of red and blue
        pmaddwd mm1, mm7                ; same multiplier as the main loop
        pand    mm0, mm6                ; top 5 bits of green
        por     mm1, mm0
        psrld   mm1, 6
        movd    eax, mm1
        mov     [edi], ax

        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .tail_loop

.done:
        jmp     _mmxreturn