view src/hermes/x86_main.asm @ 1542:a8bf1aa21020

Fixed bug #15 SDL_blit_A.mmx-speed.patch.txt -- Speed improvements and a bugfix for the current GCC inline mmx asm code: - Changed some ops and removed some resulting useless ones. - Added some instruction parallelism (some gain) The resulting speed on my Xeon improved up to 35% depending on the function (measured in fps). - Fixed a bug where BlitRGBtoRGBSurfaceAlphaMMX() was setting the alpha component on the destination surfaces (to opaque-alpha) even when the surface had none. SDL_blit_A.mmx-msvc.patch.txt -- MSVC mmx intrinsics version of the same GCC asm code. MSVC compiler tries to parallelize the code and to avoid register stalls, but does not always do a very good job. Per-surface blending MSVC functions run quite a bit faster than their pure-asm counterparts (up to 55% faster for 16-bit ones), but the per-pixel blending runs somewhat slower than asm. - BlitRGBtoRGBSurfaceAlphaMMX and BlitRGBtoRGBPixelAlphaMMX (and all variants) can now also handle formats other than (A)RGB8888. Formats like RGBA8888 and some quite exotic ones are allowed -- like RAGB8888, or actually anything having channels aligned on 8-bit boundary and full 8-bit alpha (for per-pixel alpha blending). The performance cost of this change is virtually 0 for per-surface alpha blending (no extra ops inside the loop) and a single non-MMX op inside the loop for per-pixel blending. In testing, the per-pixel alpha blending takes a ~2% performance hit, but it still runs much faster than the current code in CVS. If necessary, a separate function with this functionality can be made. This code requires Processor Pack for VC6.
author Sam Lantinga <slouken@libsdl.org>
date Wed, 15 Mar 2006 15:39:29 +0000
parents 3202d727bb4b
children 393092a3ebf6
line wrap: on
line source

;
; x86 format converters for HERMES
; Some routines Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
; This source code is licensed under the GNU LGPL
; 
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions		
;
; Most routines are (c) Glenn Fiedler (ptc@gaffer.org), used with permission
; 

; Assemble everything in this file as 32-bit code.
BITS 32

; Symbols made visible to the linker.  _x86return is a label inside
; _ConvertX86's row loop, not a function of its own.
; NOTE(review): presumably exported so converter routines in other files can
; jump back to it -- confirm against the converter sources.
GLOBAL _ConvertX86
GLOBAL _x86return
	
GLOBAL _Hermes_X86_CPU

SECTION .text
		
;; _ConvertX86:
;; Runs a format converter over every row of a surface.
;;
;; One stack argument:
;;   [ESP+4] at entry (read as [EBP+8] once the frame is set up): ConverterInfo*
;; --------------------------------------------------------------------------
;; ConverterInfo (ebp+..)
;;   0:	void *s_pixels
;;   4:	int s_width
;;   8:	int s_height
;;  12:	int s_add
;;  16:	void *d_pixels
;;  20:	int d_width
;;  24:	int d_height
;;  28:	int d_add
;;  32:	void (*converter_function)() 
;;  36: int32 *lookup
;;
;; Returns immediately, touching nothing, when s_width or s_height is 0.
;;
;; Per row, the converter is entered with JMP (not CALL) and must come back by
;; jumping to _x86return.  State handed to it:
;;   esi = current source position      edi = current destination position
;;   ecx = s_width                      ebp = ConverterInfo*
;; After each row s_add / d_add are added to esi / edi.
;; NOTE(review): the converter is presumably expected to advance esi/edi across
;; the row itself and to leave ebp intact -- its code is not in this file.
;;
;; Side effect: s_height inside the caller's ConverterInfo is used as the row
;; counter and is decremented in place (it is 0 after a completed conversion).
;;
;; All general-purpose registers are preserved for the caller (pusha/popa).
;;
;; y_loop and endconvert are deliberately non-local labels: the global label
;; _x86return sits in the middle of the routine and would start a new
;; local-label scope.
	
_ConvertX86:
	push ebp
	mov ebp,esp

; Save the registers used by the blitters, necessary for optimized code
	pusha

	mov eax,[ebp+8]

; Nothing to do for an empty surface.  Both exits go through endconvert so
; that the registers pushed above are popped again before returning.
	cmp dword [eax+4],BYTE 0	; s_width == 0 ?
	je endconvert
	cmp dword [eax+8],BYTE 0	; s_height == 0 ?  (the dec below would wrap)
	je endconvert
	
	mov ebp,eax			; ebp = ConverterInfo* from here on
	
	mov esi,[ebp+0]			; esi = s_pixels
	mov edi,[ebp+16]		; edi = d_pixels
	
y_loop:	
	mov ecx,[ebp+4]			; ecx = s_width for this row

	jmp [ebp+32]			; converter_function; comes back at _x86return

_x86return:	
	add esi,[ebp+12]		; step over the source row padding (s_add)
	add edi,[ebp+28]		; step over the destination row padding (d_add)
	
	dec dword  [ebp+8]		; one row done; s_height is the counter
	jnz y_loop

endconvert:	
; Restore the registers used by the blitters, necessary for optimized code.
; popa also puts the frame pointer back into ebp, so this works for both the
; early exits (ebp untouched) and the normal exit (ebp = ConverterInfo*).
	popa
	
	pop ebp
	ret		



;; _Hermes_X86_CPU
;; Takes no arguments.  Result in eax:
;;   0                      if the AC bit of EFLAGS cannot be toggled (386),
;;                          or the ID bit cannot be toggled (no CPUID);
;;   edx of CPUID leaf 1    otherwise (the CPUID feature flags).
;; ebx is saved around CPUID.  ecx is overwritten on every path, edx when
;; CPUID runs.  On the CPUID path EFLAGS is left with the ID bit toggled
;; relative to its value on entry.

%define HERMES_EFLAGS_AC 040000h	; EFLAGS bit 18, alignment check
%define HERMES_EFLAGS_ID 200000h	; EFLAGS bit 21, CPUID available

_Hermes_X86_CPU:
	pushfd
	pop ecx				; ecx = EFLAGS as found on entry

	; Step 1: can AC be flipped?
	mov eax,ecx
	xor eax,HERMES_EFLAGS_AC
	push eax
	popfd				; try to load the modified flags ...
	pushfd
	pop eax				; ... and read back what actually stuck
	xor eax,ecx			; eax = bits that changed
	jz .done			; none: processor is a 386, return 0

	push ecx
	popfd				; put the entry EFLAGS back before step 2

	; Step 2: can ID be flipped?
	mov eax,ecx
	xor eax,HERMES_EFLAGS_ID
	push eax
	popfd
	pushfd
	pop eax
	xor eax,ecx
	jz .done			; none: no CPUID instruction, return 0

	; Step 3: CPUID leaf 1, hand back edx.
	push ebx			; CPUID overwrites ebx
	mov eax,1
	cpuid
	mov eax,edx
	pop ebx

.done:
	ret

; Only when the output format is ELF: emit an empty .note.GNU-stack section.
; NOTE(review): by GNU toolchain convention this marks the object as not
; needing an executable stack -- confirm against the linker documentation.
%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif