view src/hermes/HeadX86.h @ 1542:a8bf1aa21020

Fixed bug #15 SDL_blit_A.mmx-speed.patch.txt -- Speed improvements and a bugfix for the current GCC inline mmx asm code: - Changed some ops and removed some resulting useless ones. - Added some instruction parallelism (some gain). The resulting speed on my Xeon improved by up to 35% depending on the function (measured in fps). - Fixed a bug where BlitRGBtoRGBSurfaceAlphaMMX() was setting the alpha component on the destination surfaces (to opaque-alpha) even when the surface had none. SDL_blit_A.mmx-msvc.patch.txt -- MSVC mmx intrinsics version of the same GCC asm code. The MSVC compiler tries to parallelize the code and to avoid register stalls, but does not always do a very good job. Per-surface blending MSVC functions run quite a bit faster than their pure-asm counterparts (up to 55% faster for 16-bit ones), but the per-pixel blending runs somewhat slower than asm. - BlitRGBtoRGBSurfaceAlphaMMX and BlitRGBtoRGBPixelAlphaMMX (and all variants) can now also handle formats other than (A)RGB8888. Formats like RGBA8888 and some quite exotic ones are allowed -- like RAGB8888, or actually anything having channels aligned on an 8-bit boundary and full 8-bit alpha (for per-pixel alpha blending). The performance cost of this change is virtually 0 for per-surface alpha blending (no extra ops inside the loop) and a single non-MMX op inside the loop for per-pixel blending. In testing, the per-pixel alpha blending takes a ~2% performance hit, but it still runs much faster than the current code in CVS. If necessary, a separate function with this functionality can be made. This code requires the Processor Pack for VC6.
author Sam Lantinga <slouken@libsdl.org>
date Wed, 15 Mar 2006 15:39:29 +0000
parents bb5ace455586
children 782fd950bd46 c121d94672cb 39b9405d3cb6
line wrap: on
line source

/*
   Header definitions for the x86 routines for the HERMES library
   Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
   This source code is licensed under the GNU LGPL
  
   Please refer to the file COPYING.LIB contained in the distribution for
   licensing conditions
*/

#ifndef __HERMES_HEAD_X86__
#define __HERMES_HEAD_X86__


#ifdef X86_ASSEMBLER

/* If you can't stand IFDEFS, then close your eyes now, please :) */

/* Ok, we start with normal function definitions */
#ifdef __cplusplus
extern "C" {
#endif

/*
 * CPU query routine.  Returns an int; the meaning of the value is not
 * visible from this header.
 */
int STACKCALL Hermes_X86_CPU();

/*
 * Entry points that receive a pointer to a Hermes interface structure.
 * STACKCALL is a calling-convention macro supplied by another header.
 */
void STACKCALL ConvertX86(HermesConverterInterface *iface);
void STACKCALL ClearX86_32(HermesClearInterface *iface);
void STACKCALL ClearX86_24(HermesClearInterface *iface);
void STACKCALL ClearX86_16(HermesClearInterface *iface);
void STACKCALL ClearX86_8(HermesClearInterface *iface);

/*
 * Per-format converter routines.
 *
 * They are declared with empty parameter lists, which in C (before C23)
 * leaves the arguments unspecified rather than meaning "no arguments".
 * NOTE(review): do not change these to (void) without checking how the
 * callers store and invoke these symbols.
 */

/* "p32" family -- presumably 32-bit source pixels; confirm against the
   implementation. */
void ConvertX86p32_32BGR888();
void ConvertX86p32_32RGBA888();
void ConvertX86p32_32BGRA888();
void ConvertX86p32_24RGB888();
void ConvertX86p32_24BGR888();
void ConvertX86p32_16RGB565();
void ConvertX86p32_16BGR565();
void ConvertX86p32_16RGB555();
void ConvertX86p32_16BGR555();
void ConvertX86p32_8RGB332();

/* "p16" family -- presumably 16-bit source pixels; confirm against the
   implementation. */
void ConvertX86p16_32RGB888();
void ConvertX86p16_32BGR888();
void ConvertX86p16_32RGBA888();
void ConvertX86p16_32BGRA888();
void ConvertX86p16_24RGB888();
void ConvertX86p16_24BGR888();
void ConvertX86p16_16BGR565();
void ConvertX86p16_16RGB555();
void ConvertX86p16_16BGR555();
void ConvertX86p16_8RGB332();

/* Copy routines, one per pixel size in bytes (4, 3, 2, 1). */
void CopyX86p_4byte();
void CopyX86p_3byte();
void CopyX86p_2byte();
void CopyX86p_1byte();

/* "I8" family -- presumably 8-bit indexed source pixels; confirm. */
void ConvertX86pI8_32();
void ConvertX86pI8_24();
void ConvertX86pI8_16();

/*
 * Lookup tables of 512 ints each, defined outside this header, one for
 * each of the p16 -> 32-bit conversions declared above.
 */
extern int ConvertX86p16_32RGB888_LUT_X86[512];
extern int ConvertX86p16_32BGR888_LUT_X86[512];
extern int ConvertX86p16_32RGBA888_LUT_X86[512];
extern int ConvertX86p16_32BGRA888_LUT_X86[512];

#ifdef __cplusplus
}
#endif




/* Now fix up the ELF underscore problem */

#if defined(__ELF__) && defined(__GNUC__)
  /*
   * With GCC on an ELF target, every routine listed below is declared
   * under its underscore-prefixed name and the plain name is then mapped
   * onto it with a macro, so code using the plain spelling ends up
   * referencing the "_"-prefixed symbol.
   *
   * Each declaration is kept next to its alias so the two cannot drift
   * apart.
   *
   * NOTE(review): only a subset of the routines declared earlier in this
   * header is aliased here (no ClearX86_*, CopyX86p_*, ConvertX86pI8_*,
   * ConvertX86p16_32* / ConvertX86p16_24* or *_LUT_X86 entries) --
   * confirm that this is intentional.
   */
  #ifdef __cplusplus
  extern "C" {
  #endif

  /* CPU query */
  extern int _Hermes_X86_CPU();
  #define Hermes_X86_CPU _Hermes_X86_CPU

  /* Main converter entry point */
  extern void _ConvertX86(HermesConverterInterface *);
  #define ConvertX86 _ConvertX86

  /* p32 converters */
  extern void _ConvertX86p32_32BGR888();
  #define ConvertX86p32_32BGR888 _ConvertX86p32_32BGR888

  extern void _ConvertX86p32_32RGBA888();
  #define ConvertX86p32_32RGBA888 _ConvertX86p32_32RGBA888

  extern void _ConvertX86p32_32BGRA888();
  #define ConvertX86p32_32BGRA888 _ConvertX86p32_32BGRA888

  extern void _ConvertX86p32_24RGB888();
  #define ConvertX86p32_24RGB888 _ConvertX86p32_24RGB888

  extern void _ConvertX86p32_24BGR888();
  #define ConvertX86p32_24BGR888 _ConvertX86p32_24BGR888

  extern void _ConvertX86p32_16RGB565();
  #define ConvertX86p32_16RGB565 _ConvertX86p32_16RGB565

  extern void _ConvertX86p32_16BGR565();
  #define ConvertX86p32_16BGR565 _ConvertX86p32_16BGR565

  extern void _ConvertX86p32_16RGB555();
  #define ConvertX86p32_16RGB555 _ConvertX86p32_16RGB555

  extern void _ConvertX86p32_16BGR555();
  #define ConvertX86p32_16BGR555 _ConvertX86p32_16BGR555

  extern void _ConvertX86p32_8RGB332();
  #define ConvertX86p32_8RGB332 _ConvertX86p32_8RGB332

  /* p16 converters */
  extern void _ConvertX86p16_16BGR565();
  #define ConvertX86p16_16BGR565 _ConvertX86p16_16BGR565

  extern void _ConvertX86p16_16RGB555();
  #define ConvertX86p16_16RGB555 _ConvertX86p16_16RGB555

  extern void _ConvertX86p16_16BGR555();
  #define ConvertX86p16_16BGR555 _ConvertX86p16_16BGR555

  extern void _ConvertX86p16_8RGB332();
  #define ConvertX86p16_8RGB332 _ConvertX86p16_8RGB332

  #ifdef __cplusplus
  }
  #endif

#endif /* ELF & GNU */



/* Make it run with WATCOM C */
#ifdef __WATCOMC__
/*
 * Watcom C support: auxiliary pragmas describing how each symbol declared
 * in this header is named in the object file.  The "_*" pattern asks the
 * compiler to emit the symbol as an underscore followed by the C name.
 */

/* NOTE(review): changes the level of Watcom warning 601 to 9 -- confirm
   which diagnostic this is and why it is adjusted here. */
#pragma warning 601 9

/* CPU query routine: naming only. */
#pragma aux Hermes_X86_CPU "_*"

/* STACKCALL entry points: in addition to the naming pattern, the modify
   clause tells the compiler that these routines change EAX, EBX, ECX,
   EDX, ESI and EDI. */
#pragma aux ConvertX86 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ClearX86_32 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ClearX86_24 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ClearX86_16 "_*" modify [EAX EBX ECX EDX ESI EDI]
#pragma aux ClearX86_8 "_*" modify [EAX EBX ECX EDX ESI EDI]

/* p32 converter routines: naming only. */
#pragma aux ConvertX86p32_32BGR888 "_*"
#pragma aux ConvertX86p32_32RGBA888 "_*"
#pragma aux ConvertX86p32_32BGRA888 "_*"
#pragma aux ConvertX86p32_24RGB888 "_*"
#pragma aux ConvertX86p32_24BGR888 "_*"
#pragma aux ConvertX86p32_16RGB565 "_*"
#pragma aux ConvertX86p32_16BGR565 "_*"
#pragma aux ConvertX86p32_16RGB555 "_*"
#pragma aux ConvertX86p32_16BGR555 "_*"
#pragma aux ConvertX86p32_8RGB332 "_*"

/* p16 converter routines: naming only. */
#pragma aux ConvertX86p16_32RGB888 "_*"
#pragma aux ConvertX86p16_32BGR888 "_*"
#pragma aux ConvertX86p16_32RGBA888 "_*"
#pragma aux ConvertX86p16_32BGRA888 "_*"
#pragma aux ConvertX86p16_24RGB888 "_*"
#pragma aux ConvertX86p16_24BGR888 "_*"
#pragma aux ConvertX86p16_16BGR565 "_*"
#pragma aux ConvertX86p16_16RGB555 "_*"
#pragma aux ConvertX86p16_16BGR555 "_*"
#pragma aux ConvertX86p16_8RGB332 "_*"

/* Copy routines: naming only. */
#pragma aux CopyX86p_4byte "_*"
#pragma aux CopyX86p_3byte "_*"
#pragma aux CopyX86p_2byte "_*"
#pragma aux CopyX86p_1byte "_*"

/* I8 converter routines: naming only. */
#pragma aux ConvertX86pI8_32 "_*"
#pragma aux ConvertX86pI8_24 "_*"
#pragma aux ConvertX86pI8_16 "_*"

/* Lookup tables (data symbols): naming only. */
#pragma aux ConvertX86p16_32RGB888_LUT_X86 "_*"
#pragma aux ConvertX86p16_32BGR888_LUT_X86 "_*"
#pragma aux ConvertX86p16_32RGBA888_LUT_X86 "_*"
#pragma aux ConvertX86p16_32BGRA888_LUT_X86 "_*"

#endif /* __WATCOMC__ */


#endif /* X86_ASSEMBLER */


#endif