# HG changeset patch
# User Sam Lantinga <slouken@libsdl.org>
# Date 1187243784 0
# Node ID 5a58b57b6724c1382b89197220708b4dff1c3456
# Parent  5cd2a2293cf0d90d0b8208170f43088d07a9c1ed
Added SSE and MMX optimization for SDL_FillRect()

diff -r 5cd2a2293cf0 -r 5a58b57b6724 src/video/SDL_blit.c
--- a/src/video/SDL_blit.c	Thu Aug 16 02:14:13 2007 +0000
+++ b/src/video/SDL_blit.c	Thu Aug 16 05:56:24 2007 +0000
@@ -110,7 +110,8 @@
 #ifdef __MACOSX__
 #include <sys/sysctl.h>
 
-static SDL_bool SDL_UseAltivecPrefetch()
+static SDL_bool
+SDL_UseAltivecPrefetch()
 {
     const char key[] = "hw.l3cachesize";
     u_int64_t result = 0;
@@ -123,14 +124,16 @@
     }
 }
 #else
-static SDL_bool SDL_UseAltivecPrefetch()
+static SDL_bool
+SDL_UseAltivecPrefetch()
 {
     /* Just guess G4 */
     return SDL_TRUE;
 }
 #endif /* __MACOSX__ */
 
-static SDL_loblit SDL_ChooseBlitFunc(SDL_BlitEntry *entries, int count)
+static SDL_loblit
+SDL_ChooseBlitFunc(SDL_BlitEntry * entries, int count)
 {
     int i;
     static Uint32 features = 0xffffffff;
diff -r 5cd2a2293cf0 -r 5a58b57b6724 src/video/SDL_blit.h
--- a/src/video/SDL_blit.h	Thu Aug 16 02:14:13 2007 +0000
+++ b/src/video/SDL_blit.h	Thu Aug 16 05:56:24 2007 +0000
@@ -24,6 +24,13 @@
 #ifndef _SDL_blit_h
 #define _SDL_blit_h
 
+#ifdef __MMX__
+#include <mmintrin.h>
+#endif
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+
 #include "SDL_endian.h"
 
 /* The structure passed to the low level blit functions */
@@ -92,6 +99,14 @@
  * Useful macros for blitting routines
  */
 
+#if defined(__GNUC__)           /* DECLARE_ALIGNED(t,v,a): declare v with type t and alignment a */
+#define DECLARE_ALIGNED(t,v,a)  t __attribute__((aligned(a))) v
+#elif defined(_MSC_VER)         /* same request, spelled with __declspec */
+#define DECLARE_ALIGNED(t,v,a)  t __declspec(align(a)) v
+#else                           /* any other compiler: plain declaration, no alignment is requested */
+#define DECLARE_ALIGNED(t,v,a)  t v
+#endif
+
 #define FORMAT_EQUAL(A, B)						\
     ((A)->BitsPerPixel == (B)->BitsPerPixel				\
      && ((A)->Rmask == (B)->Rmask) && ((A)->Amask == (B)->Amask))
diff -r 5cd2a2293cf0 -r 5a58b57b6724 src/video/SDL_blit_copy.c
--- a/src/video/SDL_blit_copy.c	Thu Aug 16 02:14:13 2007 +0000
+++ b/src/video/SDL_blit_copy.c	Thu Aug 16 05:56:24 2007 +0000
@@ -23,13 +23,8 @@
 
 #include "SDL_video.h"
 #include "SDL_blit.h"
+#include "SDL_blit_copy.h"
 
-#ifdef __MMX__
-#include <mmintrin.h>
-#endif
-#ifdef __SSE__
-#include <xmmintrin.h>
-#endif
 
 #ifdef __MMX__
 static __inline__ void
diff -r 5cd2a2293cf0 -r 5a58b57b6724 src/video/SDL_surface.c
--- a/src/video/SDL_surface.c	Thu Aug 16 02:14:13 2007 +0000
+++ b/src/video/SDL_surface.c	Thu Aug 16 05:56:24 2007 +0000
@@ -509,20 +509,220 @@
     return 0;
 }
 
-static int
-SDL_FillRect1(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
-{
-    /* FIXME: We have to worry about packing order.. *sigh* */
-    SDL_SetError("1-bpp rect fill not yet implemented");
-    return -1;
+#ifdef __SSE__
+/* *INDENT-OFF* */
+
+/* Declares c128 holding four copies of 'color'.  Both declarations come
+   before the first statement so the expansion is also valid C89. */
+#define SSE_BEGIN \
+    __m128 c128; \
+    DECLARE_ALIGNED(Uint32, cccc[4], 16); \
+    cccc[0] = cccc[1] = cccc[2] = cccc[3] = color; \
+    c128 = *(__m128 *)cccc;
+
+#define SSE_WORK /* store c128 over every whole 64-byte chunk of the n bytes at p */ \
+    for (i = n / 64; i--;) { \
+        _mm_stream_ps((float *)(p+0), c128); /* four 16-byte stores per pass */ \
+        _mm_stream_ps((float *)(p+16), c128); \
+        _mm_stream_ps((float *)(p+32), c128); \
+        _mm_stream_ps((float *)(p+48), c128); \
+        p += 64; /* p advances; n is left unchanged */ \
+    }
+
+#define SSE_END /* expands to nothing (MMX_END, by contrast, calls _mm_empty()) */
+
+/* Defines SDL_FillRect<bpp>SSE(), which fills h rows of w pixels; each pixel is
+   bpp bytes wide and is written as (type)color.  Rows are 'pitch' bytes apart. */
+#define DEFINE_SSE_FILLRECT(bpp, type) \
+static void \
+SDL_FillRect##bpp##SSE(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
+{ \
+    SSE_BEGIN; \
+    while (h--) { \
+        int i, n = w * bpp; /* n: row width in bytes */ \
+        Uint8 *p = pixels; \
+        /* Rows of 16+ bytes: fill pixel by pixel up to a 16-byte boundary, then SSE_WORK. */ \
+        if (n > 15) { \
+            int adjust = 16 - ((uintptr_t)p & 15); \
+            if (adjust < 16) { /* adjust == 16 means p is already 16-byte aligned */ \
+                n -= adjust; \
+                adjust /= bpp; /* bytes -> pixels; NOTE(review): assumes p is a multiple of bpp */ \
+                while(adjust--) { \
+                    *((type *)p) = (type)color; \
+                    p += bpp; \
+                } \
+            } \
+            SSE_WORK; \
+        } \
+        if (n & 63) { /* bytes not covered by whole 64-byte chunks */ \
+            int remainder = (n & 63); \
+            remainder /= bpp; \
+            while(remainder--) { \
+                *((type *)p) = (type)color; \
+                p += bpp; \
+            } \
+        } \
+        pixels += pitch; \
+    } \
+    SSE_END; \
 }
 
-static int
-SDL_FillRect4(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
+DEFINE_SSE_FILLRECT(1, Uint8)   /* defines SDL_FillRect1SSE */
+DEFINE_SSE_FILLRECT(2, Uint16)  /* defines SDL_FillRect2SSE */
+DEFINE_SSE_FILLRECT(4, Uint32)  /* defines SDL_FillRect4SSE */
+
+/* *INDENT-ON* */
+#endif /* __SSE__ */
+
+#ifdef __MMX__
+/* *INDENT-OFF* */
+
+#define MMX_BEGIN /* declares c64, built from two copies of the 32-bit 'color' */ \
+    __m64 c64 = _mm_set_pi32(color, color)
+
+/* Fill every whole 64-byte chunk of the n bytes at p with c64, using plain
+   8-byte MMX stores.  _mm_stream_pi() (MOVNTQ) was introduced with SSE, so
+   it cannot be used on a path that SDL_FillRect() selects on SDL_HasMMX()
+   alone.  Callers align p to 8 bytes first; n is left unchanged. */
+#define MMX_WORK \
+    for (i = n / 64; i--;) { \
+        ((__m64 *)p)[0] = c64; ((__m64 *)p)[1] = c64; \
+        ((__m64 *)p)[2] = c64; ((__m64 *)p)[3] = c64; \
+        ((__m64 *)p)[4] = c64; ((__m64 *)p)[5] = c64; \
+        ((__m64 *)p)[6] = c64; ((__m64 *)p)[7] = c64; \
+        p += 64; \
+    }
+
+#define MMX_END /* NOTE(review): presumably emits EMMS so x87 code can follow - confirm */ \
+    _mm_empty()
+
+/* Defines SDL_FillRect<bpp>MMX(), which fills h rows of w pixels; each pixel is
+   bpp bytes wide and is written as (type)color.  Rows are 'pitch' bytes apart. */
+#define DEFINE_MMX_FILLRECT(bpp, type) \
+static void \
+SDL_FillRect##bpp##MMX(Uint8 *pixels, int pitch, Uint32 color, int w, int h) \
+{ \
+    MMX_BEGIN; \
+    while (h--) { \
+        int i, n = w * bpp; /* n: row width in bytes */ \
+        Uint8 *p = pixels; \
+        /* Rows of 8+ bytes: fill pixel by pixel up to an 8-byte boundary, then MMX_WORK. */ \
+        if (n > 7) { \
+            int adjust = 8 - ((uintptr_t)p & 7); \
+            if (adjust < 8) { /* adjust == 8 means p is already 8-byte aligned */ \
+                n -= adjust; \
+                adjust /= bpp; /* bytes -> pixels; NOTE(review): assumes p is a multiple of bpp */ \
+                while(adjust--) { \
+                    *((type *)p) = (type)color; \
+                    p += bpp; \
+                } \
+            } \
+            MMX_WORK; \
+        } \
+        if (n & 63) { /* bytes not covered by whole 64-byte chunks */ \
+            int remainder = (n & 63); \
+            remainder /= bpp; \
+            while(remainder--) { \
+                *((type *)p) = (type)color; \
+                p += bpp; \
+            } \
+        } \
+        pixels += pitch; \
+    } \
+    MMX_END; \
+}
+
+DEFINE_MMX_FILLRECT(1, Uint8)   /* defines SDL_FillRect1MMX */
+DEFINE_MMX_FILLRECT(2, Uint16)  /* defines SDL_FillRect2MMX */
+DEFINE_MMX_FILLRECT(4, Uint32)  /* defines SDL_FillRect4MMX */
+
+/* *INDENT-ON* */
+#endif /* __MMX__ */
+
+/* Fill a w x h rect of 8-bit pixels; 'pitch' is the byte distance between rows. */
+static void
+SDL_FillRect1(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
 {
-    /* FIXME: We have to worry about packing order.. *sigh* */
-    SDL_SetError("4-bpp rect fill not yet implemented");
-    return -1;
+    while (h--) {
+        int n = w;
+        Uint8 *p = pixels;
+        if (n > 3) {
+            switch ((uintptr_t) p & 3) {        /* single bytes up to a 4-byte boundary */
+            case 1:
+                *p++ = (Uint8) color;
+                --n;            /* fall through */
+            case 2:
+                *p++ = (Uint8) color;
+                --n;            /* fall through */
+            case 3:
+                *p++ = (Uint8) color;
+                --n;
+            }
+            SDL_memset4(p, color, (n >> 2));
+        }
+        if (n & 3) {            /* 1-3 bytes remain after the 4-byte groups */
+            p += (n & ~3);      /* step over the 4 * (n >> 2) bytes left to SDL_memset4 */
+            switch (n & 3) {
+            case 3:
+                *p++ = (Uint8) color;   /* fall through */
+            case 2:
+                *p++ = (Uint8) color;   /* fall through */
+            case 1:
+                *p++ = (Uint8) color;
+            }
+        }
+        pixels += pitch;
+    }
+}
+
+/* Fill a w x h rect of 16-bit pixels; whole pixel pairs go through SDL_memset4. */
+static void
+SDL_FillRect2(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
+{
+    for (; h--; pixels += pitch) {
+        Uint16 *dst = (Uint16 *) pixels;
+        int left = w;
+
+        if (left >= 2) {
+            if (((uintptr_t) dst & 2) != 0) {   /* step to a 4-byte boundary */
+                *dst++ = (Uint16) color;
+                left -= 1;
+            }
+            SDL_memset4(dst, color, (left >> 1));
+        }
+        if ((left & 1) != 0) {  /* odd count: the last pixel is still unwritten */
+            dst[left - 1] = (Uint16) color;
+        }
+    }
+}
+
+/* Fill a w x h rect of 24-bit pixels, storing the low 3 bytes of 'color' in
+   host byte order, as the SDL_memcpy(pixels, &color, 3) code it replaces did. */
+static void
+SDL_FillRect3(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
+{
+#if SDL_BYTEORDER == SDL_BIG_ENDIAN     /* most significant byte first */
+    const Uint8 b0 = (Uint8) (color >> 16), b2 = (Uint8) color;
+#else                           /* least significant byte first */
+    const Uint8 b0 = (Uint8) color, b2 = (Uint8) (color >> 16);
+#endif
+    const Uint8 b1 = (Uint8) (color >> 8);
+    for (; h--; pixels += pitch) {
+        Uint8 *p = pixels;
+        int n;
+        for (n = w; n--; p += 3) {
+            p[0] = b0; p[1] = b1; p[2] = b2;
+        }
+    }
+}
+
+/* Fill a w x h rect of 32-bit pixels: one SDL_memset4(pixels, color, w) per row. */
+static void
+SDL_FillRect4(Uint8 * pixels, int pitch, Uint32 color, int w, int h)
+{
+    for (; h--; pixels += pitch) {
+        SDL_memset4(pixels, color, w);
+    }
 }
 
 /* 
@@ -531,23 +731,12 @@
 int
 SDL_FillRect(SDL_Surface * dst, SDL_Rect * dstrect, Uint32 color)
 {
-    int x, y;
-    Uint8 *row;
+    Uint8 *pixels;
 
     /* This function doesn't work on surfaces < 8 bpp */
     if (dst->format->BitsPerPixel < 8) {
-        switch (dst->format->BitsPerPixel) {
-        case 1:
-            return SDL_FillRect1(dst, dstrect, color);
-            break;
-        case 4:
-            return SDL_FillRect4(dst, dstrect, color);
-            break;
-        default:
-            SDL_SetError("Fill rect on unsupported surface format");
-            return (-1);
-            break;
-        }
+        SDL_SetError("Fill rect on unsupported surface format");
+        return (-1);
     }
 
     /* If 'dstrect' == NULL, then fill the whole surface */
@@ -564,97 +753,83 @@
     if (SDL_LockSurface(dst) != 0) {
         return (-1);
     }
-    row = (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
+
+    pixels =
+        (Uint8 *) dst->pixels + dstrect->y * dst->pitch +
         dstrect->x * dst->format->BytesPerPixel;
-    if (dst->format->palette || (color == 0)) {
-        x = dstrect->w * dst->format->BytesPerPixel;
-#ifndef __MACOSX__              /* memset() is optimized on Mac OS X */
-        if (!color && !((uintptr_t) row & 3) && !(x & 3)
-            && !(dst->pitch & 3)) {
-            int n = x >> 2;
-            for (y = dstrect->h; y; --y) {
-                SDL_memset4(row, 0, n);
-                row += dst->pitch;
-            }
-        } else
-#endif /* !__MACOSX__ */
+
+    switch (dst->format->BytesPerPixel) {
+    case 1:
         {
-            for (y = dstrect->h; y; y--) {
-                SDL_memset(row, color, x);
-                row += dst->pitch;
+            color |= (color << 8);
+            color |= (color << 16);
+#ifdef __SSE__
+            if (SDL_HasSSE()) {
+                SDL_FillRect1SSE(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
             }
-        }
-    } else {
-        switch (dst->format->BytesPerPixel) {
-        case 2:
-            {
-                Uint16 c = (Uint16) color;
-                Uint32 cc = (Uint32) c << 16 | c;
-                for (y = dstrect->h; y; --y) {
-                    Uint16 *pixels = (Uint16 *) row;
-                    int n = dstrect->w;
-                    if ((uintptr_t) pixels & 3) {
-                        *pixels++ = c;
-                        n--;
-                    }
-                    if (n >> 1)
-                        SDL_memset4(pixels, cc, n >> 1);
-                    if (n & 1)
-                        pixels[n - 1] = c;
-                    row += dst->pitch;
-                }
+#endif
+#ifdef __MMX__
+            if (SDL_HasMMX()) {
+                SDL_FillRect1MMX(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
             }
+#endif
+            SDL_FillRect1(pixels, dst->pitch, color, dstrect->w, dstrect->h);
             break;
-
-        case 3:
-#if SDL_BYTEORDER == SDL_BIG_ENDIAN
-            color <<= 8;
-#endif
-            for (y = dstrect->h; y; --y) {
-                Uint8 *pixels = row;
-                for (x = dstrect->w; x; --x) {
-                    SDL_memcpy(pixels, &color, 3);
-                    pixels += 3;
-                }
-                row += dst->pitch;
-            }
-            break;
+        }
 
-        case 4:
-#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
-            if (SDL_HasSSE() && !((uintptr_t) row & 15) && !(dstrect->w & 3)) {
-                Uint32 cccc[4] __attribute__ ((aligned(16))) = {
-                color, color, color, color};
-                int i, n = dstrect->w / 4;
-                __asm__ __volatile__("	movdqa (%0), %%xmm0\n"::
-                                     "r"(cccc):"memory");
-                for (y = dstrect->h; y; --y) {
-                    Uint8 *pixels = row;
-                    for (i = n / 2; i--;) {
-                        /* *INDENT-OFF* */
-                        __asm__ __volatile__("	prefetchnta 256(%0)\n"
-                                             "	movdqa %%xmm0, (%0)\n"
-                                             "	movdqa %%xmm0, 16(%0)\n"::"r"(pixels):"memory");
-                        /* *INDENT-ON* */
-                        pixels += 32;
-                    }
-                    if (n & 1) {
-                        __asm__ __volatile__("	movdqa %%xmm0, (%0)\n"::
-                                             "r"(pixels):"memory");
-                    }
-                    row += dst->pitch;
-                }
-                __asm__ __volatile__("	emms\n"::);
+    case 2:
+        {
+            color |= (color << 16);
+#ifdef __SSE__
+            if (SDL_HasSSE()) {
+                SDL_FillRect2SSE(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
                 break;
             }
 #endif
-            for (y = dstrect->h; y; --y) {
-                SDL_memset4(row, color, dstrect->w);
-                row += dst->pitch;
+#ifdef __MMX__
+            if (SDL_HasMMX()) {
+                SDL_FillRect2MMX(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
             }
+#endif
+            SDL_FillRect2(pixels, dst->pitch, color, dstrect->w, dstrect->h);
+            break;
+        }
+
+    case 3:
+        /* 24-bit RGB is a slow path, at least for now. */
+        {
+            SDL_FillRect3(pixels, dst->pitch, color, dstrect->w, dstrect->h);
+            break;
+        }
+
+    case 4:
+        {
+#ifdef __SSE__
+            if (SDL_HasSSE()) {
+                SDL_FillRect4SSE(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
+            }
+#endif
+#ifdef __MMX__
+            if (SDL_HasMMX()) {
+                SDL_FillRect4MMX(pixels, dst->pitch, color, dstrect->w,
+                                 dstrect->h);
+                break;
+            }
+#endif
+            SDL_FillRect4(pixels, dst->pitch, color, dstrect->w, dstrect->h);
             break;
         }
     }
+
     SDL_UnlockSurface(dst);
 
     /* We're done! */