view src/hermes/mmxp2_32.asm @ 876:9e84d106ec19

(Said Max Horn on the SDL mailing list...) Hi folks, based on Eric Wing's patch, I created the attached patch which fixes the OpenGL coordinate inversion bug in SDL. It works fine over here on 10.3 with Ryan's test program (which I also attached). There is another change in it: I removed the "- 1" in the two lines using CGDisplayPixelsHigh()... while I understand from a logical point of view why they *should* be correct, I checked the actual values computed that way, and they were off-by-one. After removing the " - 1", the returned mouse coordinates are correct. I checked this by moving the mouse to the screen top/bottom in fullscreen mode, BTW. With the change, the proper values 0 and 479 are returned (in 640x480 mode). Sam, you may still want to test on 10.1, it's very simple using Ryan's minimal test code :-) Cheers, Max (Here is the reproduction case for revision history's sake...) /* * To compile: * gcc -o test test.c `sdl-config --cflags` `sdl-config --libs` -framework OpenGL * * --ryan. */ #include <stdio.h> #include "SDL.h" #include "SDL_opengl.h" int main(int argc, char **argv) { Uint32 flags = SDL_OPENGL /* | SDL_FULLSCREEN */; SDL_Surface *screen; SDL_Event event; int done = 0; GLfloat ratio; SDL_Init(SDL_INIT_VIDEO); SDL_ShowCursor(0); if ((argv[1]) && (strcmp(argv[1], "--grab") == 0)) SDL_WM_GrabInput(SDL_GRAB_ON); screen = SDL_SetVideoMode(640, 480, 0, flags); if (!screen) return(42); ratio = ((GLfloat) screen->w) / ((GLfloat) screen->h); glClearColor( 0.0f, 0.0f, 0.0f, 0.0f ); glClearDepth( 1.0f ); glEnable( GL_DEPTH_TEST ); glDepthFunc( GL_LEQUAL ); glViewport( 0, 0, screen->w, screen->h); glMatrixMode( GL_PROJECTION ); glLoadIdentity(); gluPerspective( 45.0f, ratio, 0.1f, 100.0f ); glMatrixMode( GL_MODELVIEW ); glLoadIdentity(); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); SDL_GL_SwapBuffers(); // eh, close enough. #define MAX_X 6.12 #define MAX_Y 4.50 while (!done) { int x, y; GLfloat glx, gly; if (!SDL_WaitEvent(&event)) break; switch (event.type) { case SDL_KEYUP: if (event.key.keysym.sym == SDLK_ESCAPE) done = 1; break; } SDL_GetMouseState(&x, &y); glx = ((((GLfloat) x) / ((GLfloat) screen->w)) - 0.5f) * MAX_X; gly = ((((GLfloat) y) / ((GLfloat) screen->h)) - 0.5f) * MAX_Y; glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); glLoadIdentity(); glTranslatef(glx,-gly,-6.0f); glBegin(GL_TRIANGLES); glColor3f(1,0,0); glVertex3f( 0.00f, 0.25f, 0.00f); glColor3f(0,1,0); glVertex3f(-0.25f, -0.25f, 0.00f); glColor3f(0,0,1); glVertex3f( 0.25f, -0.25f, 0.00f); glEnd(); SDL_GL_SwapBuffers(); } SDL_Quit(); return(0); } /* end of test.c ... */
author Ryan C. Gordon <icculus@icculus.org>
date Mon, 22 Mar 2004 09:38:20 +0000
parents 77b6110c797d
children da33b7e6d181
line wrap: on
line source

;
; pII-optimised MMX format converters for HERMES
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
; This source code is licensed under the GNU LGPL
; 
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions		
;
; COPYRIGHT NOTICE
; 
; This file partly contains code that is (c) Intel Corporation, specifically
; the mode detection routine, and the converter to 15 bit (8 pixel
; conversion routine from the mmx programming tutorial pages).
;
;
; These routines aren't exactly pII optimised - it's just that as they
; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
; optimise them for p5 MMXs..

BITS 32

	
GLOBAL _ConvertMMXpII32_24RGB888
GLOBAL _ConvertMMXpII32_16RGB565
GLOBAL _ConvertMMXpII32_16BGR565
GLOBAL _ConvertMMXpII32_16RGB555
GLOBAL _ConvertMMXpII32_16BGR555

EXTERN _mmxreturn
 
SECTION .data
	
ALIGN 8

;; Constants for conversion routines

mmx32_rgb888_mask dd 00ffffffh,00ffffffh

mmx32_rgb565_b dd 000000f8h, 000000f8h
mmx32_rgb565_g dd 0000fc00h, 0000fc00h
mmx32_rgb565_r dd 00f80000h, 00f80000h

mmx32_rgb555_rb dd 00f800f8h,00f800f8h
mmx32_rgb555_g dd 0000f800h,0000f800h
mmx32_rgb555_mul dd 20000008h,20000008h
mmx32_bgr555_mul dd 00082000h,00082000h


			
SECTION .text

_ConvertMMXpII32_24RGB888:

        ; set up mm6 as the mask, mm7 as zero
        movq mm6, qword [mmx32_rgb888_mask]
        pxor mm7, mm7

        mov edx, ecx                    ; save ecx
        and ecx, 0fffffffch             ; clear lower two bits
        jnz .L1
        jmp .L2

.L1:

        movq mm0, [esi]                 ; A R G B a r g b
        pand mm0, mm6                   ; 0 R G B 0 r g b
        movq mm1, [esi+8]               ; A R G B a r g b
        pand mm1, mm6                   ; 0 R G B 0 r g b

        movq mm2, mm0                   ; 0 R G B 0 r g b
        punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
        punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
        psllq mm2, 24                   ; 0 0 R G B 0 0 0
        por mm0, mm2                    ; 0 0 R G B r g b

        movq mm3, mm1                   ; 0 R G B 0 r g b
        psllq mm3, 48                   ; g b 0 0 0 0 0 0
        por mm0, mm3                    ; g b R G B r g b

        movq mm4, mm1                   ; 0 R G B 0 r g b
        punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B
        punpckldq mm1, mm7              ; 0 0 0 0 0 r g b
        psrlq mm1, 16                   ; 0 0 0 R G B 0 r
        psllq mm4, 8                    ; 0 0 0 0 R G B 0
        por mm1, mm4                    ; 0 0 0 0 R G B r

        movq [edi], mm0
        add esi, BYTE 16
        movd [edi+8], mm1
        add edi, BYTE 12
        sub ecx, BYTE 4
        jnz .L1

.L2:
        mov ecx, edx
        and ecx, BYTE 3
        jz .L4
.L3:
        mov al, [esi]
        mov bl, [esi+1]
        mov dl, [esi+2]
        mov [edi], al
        mov [edi+1], bl
        mov [edi+2], dl
        add esi, BYTE 4
        add edi, BYTE 3
        dec ecx
        jnz .L3
.L4:
        jmp _mmxreturn



_ConvertMMXpII32_16RGB565:

        ; set up masks
        movq mm5, [mmx32_rgb565_b]
        movq mm6, [mmx32_rgb565_g]
        movq mm7, [mmx32_rgb565_r]

        mov edx, ecx
        shr ecx, 2
        jnz .L1
        jmp .L2         ; not necessary at the moment, but doesn't hurt (much)

.L1:
        movq mm0, [esi]         ; argb
        movq mm1, mm0           ; argb
        pand mm0, mm6           ; 00g0
        movq mm3, mm1           ; argb
        pand mm1, mm5           ; 000b
        pand mm3, mm7           ; 0r00
        pslld mm1, 2            ; 0 0 000000bb bbb00000
        por mm0, mm1            ; 0 0 ggggggbb bbb00000
        psrld mm0, 5            ; 0 0 00000ggg gggbbbbb

        movq mm4, [esi+8]       ; argb
        movq mm2, mm4           ; argb
        pand mm4, mm6           ; 00g0
        movq mm1, mm2           ; argb
        pand mm2, mm5           ; 000b
        pand mm1, mm7           ; 0r00
        pslld mm2, 2            ; 0 0 000000bb bbb00000
        por mm4, mm2            ; 0 0 ggggggbb bbb00000
        psrld mm4, 5            ; 0 0 00000ggg gggbbbbb

        packuswb mm3, mm1       ; R 0 r 0
        packssdw mm0, mm4       ; as above.. ish
        por mm0, mm3            ; done.
        movq [edi], mm0

        add esi, 16
        add edi, 8
        dec ecx
        jnz .L1

.L2:
        mov ecx, edx
        and ecx, BYTE 3
        jz .L4
.L3:
        mov al, [esi]
        mov bh, [esi+1]
        mov ah, [esi+2]
        shr al, 3
        and eax, 0F81Fh            ; BYTE?
        shr ebx, 5
        and ebx, 07E0h             ; BYTE?
        add eax, ebx
        mov [edi], al
        mov [edi+1], ah
        add esi, BYTE 4
        add edi, BYTE 2
        dec ecx
        jnz .L3

.L4:
	jmp _mmxreturn

	
_ConvertMMXpII32_16BGR565:

        movq mm5, [mmx32_rgb565_r]
        movq mm6, [mmx32_rgb565_g]
        movq mm7, [mmx32_rgb565_b]

        mov edx, ecx
        shr ecx, 2
        jnz .L1
        jmp .L2

.L1:
        movq mm0, [esi]                 ; a r g b
        movq mm1, mm0                   ; a r g b
        pand mm0, mm6                   ; 0 0 g 0
        movq mm3, mm1                   ; a r g b
        pand mm1, mm5                   ; 0 r 0 0
        pand mm3, mm7                   ; 0 0 0 b

        psllq mm3, 16                   ; 0 b 0 0
        psrld mm1, 14                   ; 0 0 000000rr rrr00000
        por mm0, mm1                    ; 0 0 ggggggrr rrr00000
        psrld mm0, 5                    ; 0 0 00000ggg gggrrrrr

        movq mm4, [esi+8]               ; a r g b
        movq mm2, mm4                   ; a r g b
        pand mm4, mm6                   ; 0 0 g 0
        movq mm1, mm2                   ; a r g b
        pand mm2, mm5                   ; 0 r 0 0
        pand mm1, mm7                   ; 0 0 0 b

        psllq mm1, 16                   ; 0 b 0 0
        psrld mm2, 14                   ; 0 0 000000rr rrr00000
        por mm4, mm2                    ; 0 0 ggggggrr rrr00000
        psrld mm4, 5                    ; 0 0 00000ggg gggrrrrr

        packuswb mm3, mm1               ; BBBBB000 00000000 bbbbb000 00000000
        packssdw mm0, mm4               ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
        por mm0, mm3                    ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
        movq [edi], mm0

        add esi, BYTE 16
        add edi, BYTE 8
        dec ecx
        jnz .L1

.L2:
        and edx, BYTE 3
        jz .L4
.L3:
        mov al, [esi+2]
        mov bh, [esi+1]
        mov ah, [esi]
        shr al, 3
        and eax, 0F81Fh                    ; BYTE ?
        shr ebx, 5
        and ebx, 07E0h                     ; BYTE ?
        add eax, ebx
        mov [edi], al
        mov [edi+1], ah
        add esi, BYTE 4
        add edi, BYTE 2
        dec edx
        jnz .L3

.L4:
        jmp _mmxreturn

_ConvertMMXpII32_16BGR555:

        ; the 16BGR555 converter is identical to the RGB555 one,
        ; except it uses a different multiplier for the pmaddwd
        ; instruction.  cool huh.

        movq mm7, qword [mmx32_bgr555_mul]
        jmp _convert_bgr555_cheat

; This is the same as the Intel version.. they obviously went to
; much more trouble to expand/coil the loop than I did, so theirs
; would almost certainly be faster, even if only a little.
; I did rename 'mmx32_rgb555_add' to 'mmx32_rgb555_mul', which is
; (I think) a more accurate name..
_ConvertMMXpII32_16RGB555:

        movq mm7,qword [mmx32_rgb555_mul]
_convert_bgr555_cheat:
        movq mm6,qword [mmx32_rgb555_g]
        
	mov edx,ecx		           ; Save ecx 

        and ecx,BYTE 0fffffff8h            ; clear lower three bits
	jnz .L_OK
        jmp near .L2 

.L_OK:
	
	movq mm2,[esi+8]

	movq mm0,[esi]
	movq mm3,mm2

	pand mm3,qword [mmx32_rgb555_rb]
	movq mm1,mm0

	pand mm1,qword [mmx32_rgb555_rb]
	pmaddwd mm3,mm7

	pmaddwd mm1,mm7
	pand mm2,mm6

.L1:
	movq mm4,[esi+24]
	pand mm0,mm6

	movq mm5,[esi+16]
	por mm3,mm2

	psrld mm3,6
	por mm1,mm0

	movq mm0,mm4
	psrld mm1,6

	pand mm0,qword [mmx32_rgb555_rb]
	packssdw mm1,mm3

	movq mm3,mm5
	pmaddwd mm0,mm7

	pand mm3,qword [mmx32_rgb555_rb]
	pand mm4,mm6

	movq [edi],mm1			
	pmaddwd mm3,mm7

        add esi,BYTE 32
	por mm4,mm0

	pand mm5,mm6
	psrld mm4,6

	movq mm2,[esi+8]
	por mm5,mm3

	movq mm0,[esi]
	psrld mm5,6

	movq mm3,mm2
	movq mm1,mm0

	pand mm3,qword [mmx32_rgb555_rb]
	packssdw mm5,mm4

	pand mm1,qword [mmx32_rgb555_rb]
	pand mm2,mm6

	movq [edi+8],mm5
	pmaddwd mm3,mm7

	pmaddwd mm1,mm7
        add edi,BYTE 16
	
        sub ecx,BYTE 8
	jz .L2
        jmp .L1


.L2:	
	mov ecx,edx
	
        and ecx,BYTE 7
	jz .L4
	
.L3:	
	mov ebx,[esi]
        add esi,BYTE 4
	
        mov eax,ebx
        mov edx,ebx

        shr eax,3
        shr edx,6

        and eax,BYTE 0000000000011111b
        and edx,     0000001111100000b

        shr ebx,9

        or eax,edx

        and ebx,     0111110000000000b

        or eax,ebx

        mov [edi],ax
        add edi,BYTE 2

	dec ecx
	jnz .L3	

.L4:		
	jmp _mmxreturn