diff src/video/SDL_blit_A.c @ 1047:ffaaf7ecf685

Altivec-optimized blitters! Vast majority of this work is compliments of Bob Ippolito. http://www.devolution.com/pipermail/sdl/2005-February/067466.html and many other posts.
author Ryan C. Gordon <icculus@icculus.org>
date Sun, 17 Apr 2005 10:19:22 +0000
parents 9ef41050100c
children 2651158f59b8
line wrap: on
line diff
--- a/src/video/SDL_blit_A.c	Sun Apr 17 10:16:30 2005 +0000
+++ b/src/video/SDL_blit_A.c	Sun Apr 17 10:19:22 2005 +0000
@@ -35,9 +35,9 @@
 #define MMX_ASMBLIT
 #endif
 
-#ifdef MMX_ASMBLIT
 /* Function to check the CPU flags */
 #include "SDL_cpuinfo.h"
+#ifdef MMX_ASMBLIT
 #include "mmx.h"
 #endif
 
@@ -421,6 +421,762 @@
 }
 #endif
 
+#ifdef USE_ALTIVEC_BLITTERS
+#include <assert.h>
+#define UNALIGNED_PTR(x) (((size_t) (x)) & 0x0000000F) /* low 4 address bits: nonzero when x is not 16-byte aligned */
+#define VECPRINT(msg, v) do { /* print msg followed by the four 32-bit words of v in hex */ \
+    vector unsigned int tmpvec = (vector unsigned int)(v); \
+    unsigned int *vp = (unsigned int *)&tmpvec; \
+    printf("%s = %08X %08X %08X %08X\n", msg, vp[0], vp[1], vp[2], vp[3]); \
+} while (0)
+
+/* the permutation vector that takes the high bytes out of all the appropriate shorts
+    (vector unsigned char)(
+        0x00, 0x10, 0x02, 0x12,
+        0x04, 0x14, 0x06, 0x16,
+        0x08, 0x18, 0x0A, 0x1A,
+        0x0C, 0x1C, 0x0E, 0x1E );
+*/
+#define VEC_MERGE_PERMUTE() (vec_add(vec_lvsl(0, (int*)NULL), (vector unsigned char)vec_splat_u16(0x0F))) /* builds the permute vector listed in the comment above */
+#define VEC_U32_24() (vec_add(vec_splat_u32(12), vec_splat_u32(12))) /* 12 + 12 = 24 in every 32-bit element */
+#define VEC_ALPHA_MASK() ((vector unsigned char)vec_sl((vector unsigned int)vec_splat_s8(-1), VEC_U32_24())) /* all-ones shifted left by 24: 0xFF000000 in every 32-bit element */
+#define VEC_ALIGNER(src) ((UNALIGNED_PTR(src)) /* vec_perm control used with the vec_ld(0)/vec_ld(15) pair to realign src */ \
+    ? vec_lvsl(0, src) \
+    : vec_add(vec_lvsl(8, src), vec_splat_u8(8))) /* aligned src: presumably indices 16..31, i.e. take the second (overflow) load - confirm against the AltiVec PIM */
+
+   
+#define VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1_16, v8_16) do { /* vd = vs*valpha + vd*(255-valpha) per byte, scaled back to 8 bits */ \
+    /* vtemp1 contains source AAGGAAGGAAGGAAGG (16-bit products of the even bytes) */ \
+    vector unsigned short vtemp1 = vec_mule(vs, valpha); \
+    /* vtemp2 contains source RRBBRRBBRRBBRRBB (16-bit products of the odd bytes) */ \
+    vector unsigned short vtemp2 = vec_mulo(vs, valpha); \
+    /* valpha2 is 255-alpha (bitwise complement of each alpha byte) */ \
+    vector unsigned char valpha2 = vec_nor(valpha, valpha); \
+    /* vtemp3 contains dest AAGGAAGGAAGGAAGG */ \
+    vector unsigned short vtemp3 = vec_mule(vd, valpha2); \
+    /* vtemp4 contains dest RRBBRRBBRRBBRRBB */ \
+    vector unsigned short vtemp4 = vec_mulo(vd, valpha2); \
+    /* add source and dest */ \
+    vtemp1 = vec_add(vtemp1, vtemp3); \
+    vtemp2 = vec_add(vtemp2, vtemp4); \
+    /* vtemp1 = (vtemp1 + 1) + ((vtemp1 + 1) >> 8); with the final >>8 this approximates a divide by 255 */ \
+    vtemp1 = vec_add(vtemp1, v1_16); \
+    vtemp3 = vec_sr(vtemp1, v8_16); \
+    vtemp1 = vec_add(vtemp1, vtemp3); \
+    /* vtemp2 = (vtemp2 + 1) + ((vtemp2 + 1) >> 8) */ \
+    vtemp2 = vec_add(vtemp2, v1_16); \
+    vtemp4 = vec_sr(vtemp2, v8_16); \
+    vtemp2 = vec_add(vtemp2, vtemp4); \
+    /* (>>8) by keeping only the high byte of each 16-bit sum, interleaved back to ARGBARGBARGBARGB */ \
+    vd = (vector unsigned char)vec_perm(vtemp1, vtemp2, mergePermute); \
+} while (0)
+ 
+/* Calculate the vec_perm control vector that swizzles 32-bit pixels from srcfmt channel order to dstfmt channel order; a NULL format means the default ARGB layout defined below */
+static vector unsigned char calc_swizzle32(const SDL_PixelFormat *srcfmt,
+                                  const SDL_PixelFormat *dstfmt)
+{
+    /*
+     * We have to assume that the bits that aren't used by other
+     *  colors is alpha, and it's one complete byte, since some formats
+     *  leave alpha with a zero mask, but we should still swizzle the bits.
+     */
+    /* ARGB: shifts 16/8/0/24 and the matching one-byte masks */
+    const static struct SDL_PixelFormat default_pixel_format = {
+        NULL, 0, 0,
+        0, 0, 0, 0,
+        16, 8, 0, 24,
+        0x00FF0000, 0x0000FF00, 0x000000FF, 0xFF000000,
+        0, 0};
+    if (!srcfmt) {
+        srcfmt = &default_pixel_format;
+    }
+    if (!dstfmt) {
+        dstfmt = &default_pixel_format;
+    }
+    vector unsigned char plus = (vector unsigned char) /* byte offset of each of the four pixels inside the 16-byte vector */
+                                            ( 0x00, 0x00, 0x00, 0x00,
+                                              0x04, 0x04, 0x04, 0x04,
+                                              0x08, 0x08, 0x08, 0x08,
+                                              0x0C, 0x0C, 0x0C, 0x0C );
+    vector unsigned char vswiz;
+    vector unsigned int srcvec;
+#define RESHIFT(X) (3 - ((X) >> 3)) /* channel bit shift 0/8/16/24 -> byte index 3/2/1/0 inside the pixel */
+    Uint32 rmask = RESHIFT(srcfmt->Rshift) << (dstfmt->Rshift); /* source byte index placed at the destination channel's byte */
+    Uint32 gmask = RESHIFT(srcfmt->Gshift) << (dstfmt->Gshift);
+    Uint32 bmask = RESHIFT(srcfmt->Bshift) << (dstfmt->Bshift);
+    Uint32 amask;
+    /* If either surface lacks alpha, use index 0x10 for the alpha byte: that selects byte 0 of the second vec_perm operand, which each caller supplies */
+    if (dstfmt->Amask) {
+        amask = ((srcfmt->Amask) ? RESHIFT(srcfmt->Ashift) : 0x10) << (dstfmt->Ashift);
+    } else {
+        amask = 0x10101010 & ((dstfmt->Rmask | dstfmt->Gmask | dstfmt->Bmask) ^ 0xFFFFFFFF); /* 0x10 in every destination byte not covered by the R/G/B masks */
+    }
+#undef RESHIFT  
+    ((unsigned int *)&srcvec)[0] = (rmask | gmask | bmask | amask); /* only element 0 is written; the splat below replicates it */
+    vswiz = vec_add(plus, (vector unsigned char)vec_splat(srcvec, 0)); /* same four byte indices for every pixel, offset by 0/4/8/12 */
+    return(vswiz);
+}
+
+/* Per-pixel-alpha blend of a 32-bit source onto a 16-bit 565 destination: scalar until dst is 16-byte aligned, then 8 pixels per vector pass, then a scalar tail. */
+static void Blit32to565PixelAlphaAltivec(SDL_BlitInfo *info)
+{
+    int height = info->d_height;
+    Uint8 *src = (Uint8 *)info->s_pixels;
+    int srcskip = info->s_skip;
+    Uint8 *dst = (Uint8 *)info->d_pixels;
+    int dstskip = info->d_skip;
+    SDL_PixelFormat *srcfmt = info->src;
+
+    vector unsigned char v0 = vec_splat_u8(0);
+    vector unsigned short v8_16 = vec_splat_u16(8);
+    vector unsigned short v1_16 = vec_splat_u16(1);
+    vector unsigned short v2_16 = vec_splat_u16(2);
+    vector unsigned short v3_16 = vec_splat_u16(3);
+    vector unsigned int v8_32 = vec_splat_u32(8);
+    vector unsigned int v16_32 = vec_add(v8_32, v8_32);
+    vector unsigned short v1f = (vector unsigned short)( /* 5-bit blue field of a 565 pixel */
+        0x001f, 0x001f, 0x001f, 0x001f,
+        0x001f, 0x001f, 0x001f, 0x001f);
+    vector unsigned short vfc = (vector unsigned short)( /* top 6 bits of an 8-bit green value */
+        0x00fc, 0x00fc, 0x00fc, 0x00fc,
+        0x00fc, 0x00fc, 0x00fc, 0x00fc);
+
+    /*
+        0x10 - 0x1f is the alpha
+        0x00 - 0x0e evens are the red
+        0x01 - 0x0f odds are zero
+    */
+    vector unsigned char vredalpha1 = (vector unsigned char)(
+        0x10, 0x00, 0x01, 0x01,
+        0x10, 0x02, 0x01, 0x01,
+        0x10, 0x04, 0x01, 0x01,
+        0x10, 0x06, 0x01, 0x01
+    );
+    vector unsigned char vredalpha2 = (vector unsigned char)( /* same pattern for pixels 4..7: red index + 8 */
+        vec_add((vector unsigned int)vredalpha1, vec_sl(v8_32, v16_32))
+    );
+    /*
+        0x00 - 0x0f is ARxx ARxx ARxx ARxx
+        0x11 - 0x1f odds are blue
+    */
+    vector unsigned char vblue1 = (vector unsigned char)(
+        0x00, 0x01, 0x02, 0x11,
+        0x04, 0x05, 0x06, 0x13,
+        0x08, 0x09, 0x0a, 0x15,
+        0x0c, 0x0d, 0x0e, 0x17
+    );
+    vector unsigned char vblue2 = (vector unsigned char)( /* pixels 4..7: blue index + 8 */
+        vec_add((vector unsigned int)vblue1, v8_32)
+    );
+    /*
+        0x00 - 0x0f is ARxB ARxB ARxB ARxB
+        0x10 - 0x1e evens are green
+    */
+    vector unsigned char vgreen1 = (vector unsigned char)(
+        0x00, 0x01, 0x10, 0x03,
+        0x04, 0x05, 0x12, 0x07,
+        0x08, 0x09, 0x14, 0x0b,
+        0x0c, 0x0d, 0x16, 0x0f
+    );
+    vector unsigned char vgreen2 = (vector unsigned char)( /* pixels 4..7: green index + 8 */
+        vec_add((vector unsigned int)vgreen1, vec_sl(v8_32, v8_32))
+    );
+    vector unsigned char vgmerge = (vector unsigned char)( /* gathers the green byte of each blended pixel into the low byte of a short */
+        0x00, 0x02, 0x00, 0x06,
+        0x00, 0x0a, 0x00, 0x0e,
+        0x00, 0x12, 0x00, 0x16,
+        0x00, 0x1a, 0x00, 0x1e);
+    vector unsigned char mergePermute = VEC_MERGE_PERMUTE();
+    vector unsigned char vpermute = calc_swizzle32(srcfmt, NULL); /* source layout -> ARGB */
+    vector unsigned char valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); /* replicate byte 0 (alpha) of each pixel across its 4 bytes */
+
+    vector unsigned short vf800 = (vector unsigned short)vec_splat_u8(-8); /* 0xF8 bytes; -7 would give 0xF9 and leak a green bit into red */
+    vf800 = vec_sl(vf800, vec_splat_u16(8)); /* 0xF800: the 5-bit red field of a 565 pixel */
+
+    while(height--) {
+        int extrawidth;
+        vector unsigned char valigner;
+        vector unsigned char vsrc;
+        vector unsigned char voverflow;
+        int width = info->d_width;
+
+#define ONE_PIXEL_BLEND(condition, widthvar) \
+        while (condition) { \
+            Uint32 pixel; \
+            unsigned sR, sG, sB, dR, dG, dB, sA; \
+            DISEMBLE_RGBA(src, 4, srcfmt, pixel, sR, sG, sB, sA); \
+            if(sA) { \
+                unsigned short dstpixel = *((unsigned short *)dst); \
+                dR = (dstpixel >> 8) & 0xf8; \
+                dG = (dstpixel >> 3) & 0xfc; \
+                dB = (dstpixel << 3) & 0xf8; \
+                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
+                *((unsigned short *)dst) = ( \
+                    ((dR & 0xf8) << 8) | ((dG & 0xfc) << 3) | (dB >> 3) \
+                ); \
+            } \
+            src += 4; \
+            dst += 2; \
+            widthvar--; \
+        }
+        ONE_PIXEL_BLEND((UNALIGNED_PTR(dst)) && (width), width); /* scalar until dst is 16-byte aligned for vec_st */
+        extrawidth = (width % 8);
+        valigner = VEC_ALIGNER(src);
+        vsrc = (vector unsigned char)vec_ld(0, src);
+        width -= extrawidth;
+        while (width) {
+            vector unsigned char valpha;
+            vector unsigned char vsrc1, vsrc2;
+            vector unsigned char vdst1, vdst2;
+            vector unsigned short vR, vG, vB;
+            vector unsigned short vpixel, vrpixel, vgpixel, vbpixel;
+
+            /* Load 8 pixels from src as ARGB */
+            voverflow = (vector unsigned char)vec_ld(15, src);
+            vsrc = vec_perm(vsrc, voverflow, valigner);
+            vsrc1 = vec_perm(vsrc, vsrc, vpermute);
+            src += 16;
+            vsrc = (vector unsigned char)vec_ld(15, src);
+            voverflow = vec_perm(voverflow, vsrc, valigner);
+            vsrc2 = vec_perm(voverflow, voverflow, vpermute);
+            src += 16;
+
+            /* Load 8 pixels from dst as XRGB */
+            voverflow = vec_ld(0, dst);
+            vR = vec_and((vector unsigned short)voverflow, vf800);
+            vB = vec_sl((vector unsigned short)voverflow, v3_16);
+            vG = vec_sl(vB, v2_16); /* NOTE(review): low 2 bits of each green byte still hold blue bits; the scalar path masks with 0xfc */
+            vdst1 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha1);
+            vdst1 = vec_perm(vdst1, (vector unsigned char)vB, vblue1);
+            vdst1 = vec_perm(vdst1, (vector unsigned char)vG, vgreen1);
+            vdst2 = (vector unsigned char)vec_perm((vector unsigned char)vR, (vector unsigned char)vR, vredalpha2);
+            vdst2 = vec_perm(vdst2, (vector unsigned char)vB, vblue2);
+            vdst2 = vec_perm(vdst2, (vector unsigned char)vG, vgreen2);
+
+            /* Alpha blend 8 pixels as ARGB */
+            valpha = vec_perm(vsrc1, v0, valphaPermute);
+            VEC_MULTIPLY_ALPHA(vsrc1, vdst1, valpha, mergePermute, v1_16, v8_16);
+            valpha = vec_perm(vsrc2, v0, valphaPermute);
+            VEC_MULTIPLY_ALPHA(vsrc2, vdst2, valpha, mergePermute, v1_16, v8_16);
+
+            /* Convert 8 pixels to 565 */
+            vpixel = (vector unsigned short)vec_packpx((vector unsigned int)vdst1, (vector unsigned int)vdst2); /* 1/5/5/5 */
+            vgpixel = (vector unsigned short)vec_perm(vdst1, vdst2, vgmerge);
+            vgpixel = vec_and(vgpixel, vfc);
+            vgpixel = vec_sl(vgpixel, v3_16); /* 6-bit green into bits 10..5 */
+            vrpixel = vec_sl(vpixel, v1_16); /* 5-bit red from bits 14..10 up to 15..11 */
+            vrpixel = vec_and(vrpixel, vf800);
+            vbpixel = vec_and(vpixel, v1f); /* blue only: bit 5 of vpixel is a green bit and must not reach the 565 green LSB */
+            vdst1 = vec_or((vector unsigned char)vrpixel, (vector unsigned char)vgpixel);
+            vdst1 = vec_or(vdst1, (vector unsigned char)vbpixel);
+            /* Store 8 pixels */
+            vec_st(vdst1, 0, dst);
+
+            width -= 8;
+            dst += 16;
+        }
+        ONE_PIXEL_BLEND((extrawidth), extrawidth); /* fewer than 8 pixels left in the row */
+#undef ONE_PIXEL_BLEND
+        src += srcskip;
+        dst += dstskip;
+    }
+}
+
+/* Per-surface-alpha blend between 32-bit surfaces that skips source pixels equal to the colorkey: scalar until dstp is 16-byte aligned, 4 pixels per vector pass, scalar tail. */
+static void Blit32to32SurfaceAlphaKeyAltivec(SDL_BlitInfo *info)
+{
+    unsigned alpha = info->src->alpha;
+    int height = info->d_height;
+    Uint32 *srcp = (Uint32 *)info->s_pixels;
+    int srcskip = info->s_skip >> 2;
+    Uint32 *dstp = (Uint32 *)info->d_pixels;
+    int dstskip = info->d_skip >> 2;
+    SDL_PixelFormat *srcfmt = info->src;
+    SDL_PixelFormat *dstfmt = info->dst;
+    unsigned sA = srcfmt->alpha;
+    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
+    Uint32 rgbmask = srcfmt->Rmask | srcfmt->Gmask | srcfmt->Bmask;
+    Uint32 ckey = info->src->colorkey;
+    vector unsigned char mergePermute;
+    vector unsigned char vsrcPermute;
+    vector unsigned char vdstPermute;
+    vector unsigned char vsdstPermute;
+    vector unsigned char valpha;
+    vector unsigned char valphamask;
+    vector unsigned char vbits;
+    vector unsigned char v0;
+    vector unsigned short v1;
+    vector unsigned short v8;
+    vector unsigned int vckey;
+    vector unsigned int vrgbmask;
+
+    mergePermute = VEC_MERGE_PERMUTE();
+    v0 = vec_splat_u8(0);
+    v1 = vec_splat_u16(1);
+    v8 = vec_splat_u16(8);
+
+    /* set the alpha to 255 on the destination surf */
+    valphamask = VEC_ALPHA_MASK();
+
+    vsrcPermute = calc_swizzle32(srcfmt, NULL);   /* source layout -> ARGB */
+    vdstPermute = calc_swizzle32(NULL, dstfmt);   /* ARGB -> destination layout */
+    vsdstPermute = calc_swizzle32(dstfmt, NULL);  /* destination layout -> ARGB */
+
+    /* set a vector full of alpha and 255-alpha */
+    ((unsigned char *)&valpha)[0] = alpha;
+    valpha = vec_splat(valpha, 0);
+    vbits = (vector unsigned char)vec_splat_s8(-1);
+
+    ckey &= rgbmask;
+    ((unsigned int *)&vckey)[0] = ckey;
+    vckey = vec_splat(vckey, 0);
+    ((unsigned int *)&vrgbmask)[0] = rgbmask;
+    vrgbmask = vec_splat(vrgbmask, 0);
+
+    while(height--) {
+        int width = info->d_width;
+#define ONE_PIXEL_BLEND(condition, widthvar) \
+        while (condition) { \
+            Uint32 pixel; \
+            unsigned sR, sG, sB, dR, dG, dB; \
+            RETRIEVE_RGB_PIXEL(((Uint8 *)srcp), 4, pixel); \
+            if(sA && pixel != ckey) { \
+                RGB_FROM_PIXEL(pixel, srcfmt, sR, sG, sB); \
+                DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, pixel, dR, dG, dB); \
+                ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
+                ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
+            } \
+            ++dstp; /* one Uint32 = 4 bytes; avoids the cast-as-lvalue GCC extension */ \
+            ++srcp; \
+            widthvar--; \
+        }
+        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); /* scalar until dstp is 16-byte aligned for vec_st */
+        if (width > 0) {
+            int extrawidth = (width % 4);
+            vector unsigned char valigner = VEC_ALIGNER(srcp);
+            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
+            width -= extrawidth;
+            while (width) {
+                vector unsigned char vsel;
+                vector unsigned char voverflow;
+                vector unsigned char vd;
+                vector unsigned char vd_orig;
+
+                /* s = *srcp */
+                voverflow = (vector unsigned char)vec_ld(15, srcp);
+                vs = vec_perm(vs, voverflow, valigner);
+                
+                /* vsel is set for items that match the key */
+                vsel = (vector unsigned char)vec_and((vector unsigned int)vs, vrgbmask);
+                vsel = (vector unsigned char)vec_cmpeq((vector unsigned int)vsel, vckey);
+
+                /* permute to source format */
+                vs = vec_perm(vs, valpha, vsrcPermute);
+
+                /* d = *dstp */
+                vd = (vector unsigned char)vec_ld(0, dstp);
+                vd_orig = vd = vec_perm(vd, v0, vsdstPermute);
+
+                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
+
+                /* set the alpha channel to full on */
+                vd = vec_or(vd, valphamask);
+
+                /* mask out color key */
+                vd = vec_sel(vd, vd_orig, vsel);
+                
+                /* permute to dest format */
+                vd = vec_perm(vd, vbits, vdstPermute);
+
+                /* *dstp = res */
+                vec_st((vector unsigned int)vd, 0, dstp);
+                
+                srcp += 4;
+                dstp += 4;
+                width -= 4;
+                vs = voverflow; /* the block just loaded becomes the first half of the next realign */
+            }
+            ONE_PIXEL_BLEND((extrawidth), extrawidth); /* fewer than 4 pixels left in the row */
+        }
+#undef ONE_PIXEL_BLEND
+        srcp += srcskip;
+        dstp += dstskip;
+    }
+}
+
+/* Per-pixel-alpha blend between 32-bit surfaces of arbitrary channel layout; the destination's own alpha is kept. Scalar until dstp is 16-byte aligned, 4 pixels per vector pass, scalar tail. */
+static void Blit32to32PixelAlphaAltivec(SDL_BlitInfo *info)
+{
+    int width = info->d_width;
+    int height = info->d_height;
+    Uint32 *srcp = (Uint32 *)info->s_pixels;
+    int srcskip = info->s_skip >> 2;
+    Uint32 *dstp = (Uint32 *)info->d_pixels;
+    int dstskip = info->d_skip >> 2;
+    SDL_PixelFormat *srcfmt = info->src;
+    SDL_PixelFormat *dstfmt = info->dst;
+    vector unsigned char mergePermute;
+    vector unsigned char valphaPermute;
+    vector unsigned char vsrcPermute;
+    vector unsigned char vdstPermute;
+    vector unsigned char vsdstPermute;
+    vector unsigned char valphamask;
+    vector unsigned char vpixelmask;
+    vector unsigned char v0;
+    vector unsigned short v1;
+    vector unsigned short v8;
+
+    v0 = vec_splat_u8(0);
+    v1 = vec_splat_u16(1);
+    v8 = vec_splat_u16(8);
+    mergePermute = VEC_MERGE_PERMUTE();
+    valphamask = VEC_ALPHA_MASK();
+    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); /* indices 0,0,0,0,4,4,4,4,...: replicate each pixel's first byte */
+    vpixelmask = vec_nor(valphamask, v0); /* complement of the alpha mask */
+    vsrcPermute = calc_swizzle32(srcfmt, NULL); /* source layout -> ARGB */
+    vdstPermute = calc_swizzle32(NULL, dstfmt); /* ARGB -> destination layout */
+    vsdstPermute = calc_swizzle32(dstfmt, NULL); /* destination layout -> ARGB */
+
+	while ( height-- ) {
+        width = info->d_width;
+#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
+            Uint32 pixel; \
+            unsigned sR, sG, sB, dR, dG, dB, sA, dA; \
+            DISEMBLE_RGBA((Uint8 *)srcp, 4, srcfmt, pixel, sR, sG, sB, sA); \
+            if(sA) { \
+              DISEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, pixel, dR, dG, dB, dA); \
+              ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
+              ASSEMBLE_RGBA((Uint8 *)dstp, 4, dstfmt, dR, dG, dB, dA); \
+            } \
+            ++srcp; \
+            ++dstp; \
+            widthvar--; \
+        }
+        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); /* scalar until dstp is 16-byte aligned for vec_st */
+        if (width > 0) {
+            /* vs holds the current aligned source block and voverflow the next; */
+            /* valigner merges the two into the 16 bytes starting at srcp */
+            int extrawidth = (width % 4);
+            vector unsigned char valigner = VEC_ALIGNER(srcp);
+            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
+            width -= extrawidth;
+            while (width) {
+                vector unsigned char voverflow;
+                vector unsigned char vd;
+                vector unsigned char valpha;
+                vector unsigned char vdstalpha;
+                /* s = *srcp */
+                voverflow = (vector unsigned char)vec_ld(15, srcp);
+                vs = vec_perm(vs, voverflow, valigner);
+                vs = vec_perm(vs, v0, vsrcPermute);
+
+                valpha = vec_perm(vs, v0, valphaPermute); /* each pixel's alpha spread across its 4 bytes */
+                
+                /* d = *dstp */
+                vd = (vector unsigned char)vec_ld(0, dstp);
+                vd = vec_perm(vd, v0, vsdstPermute);
+                vdstalpha = vec_and(vd, valphamask); /* remember the destination alpha before blending */
+
+                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
+
+                /* set the alpha to the dest alpha */
+                vd = vec_and(vd, vpixelmask);
+                vd = vec_or(vd, vdstalpha);
+                vd = vec_perm(vd, v0, vdstPermute);
+
+                /* *dstp = res */
+                vec_st((vector unsigned int)vd, 0, dstp);
+                
+                srcp += 4;
+                dstp += 4;
+                width -= 4;
+                vs = voverflow; /* the block just loaded becomes the first half of the next realign */
+
+            }
+            ONE_PIXEL_BLEND((extrawidth), extrawidth); /* fewer than 4 pixels left in the row */
+        }
+	    srcp += srcskip;
+	    dstp += dstskip;
+#undef ONE_PIXEL_BLEND
+	}
+}
+
+/* fast ARGB888->(A)RGB888 blending with pixel alpha; no channel swizzling is done and the destination's top (alpha) byte is kept */
+static void BlitRGBtoRGBPixelAlphaAltivec(SDL_BlitInfo *info)
+{
+	int width = info->d_width;
+	int height = info->d_height;
+	Uint32 *srcp = (Uint32 *)info->s_pixels;
+	int srcskip = info->s_skip >> 2;
+	Uint32 *dstp = (Uint32 *)info->d_pixels;
+	int dstskip = info->d_skip >> 2;
+    vector unsigned char mergePermute;
+    vector unsigned char valphaPermute;
+    vector unsigned char valphamask;
+    vector unsigned char vpixelmask;
+    vector unsigned char v0;
+    vector unsigned short v1;
+    vector unsigned short v8;
+    v0 = vec_splat_u8(0);
+    v1 = vec_splat_u16(1);
+    v8 = vec_splat_u16(8);
+    mergePermute = VEC_MERGE_PERMUTE();
+    valphamask = VEC_ALPHA_MASK();
+    valphaPermute = vec_and(vec_lvsl(0, (int *)NULL), vec_splat_u8(0xC)); /* indices 0,0,0,0,4,4,4,4,...: replicate each pixel's first byte */
+    
+ 
+    vpixelmask = vec_nor(valphamask, v0); /* complement of the alpha mask */
+	while(height--) {
+        width = info->d_width;
+#define ONE_PIXEL_BLEND(condition, widthvar) \
+        while ((condition)) { \
+            Uint32 dalpha; \
+            Uint32 d; \
+            Uint32 s1; \
+            Uint32 d1; \
+            Uint32 s = *srcp; \
+            Uint32 alpha = s >> 24; \
+            if(alpha) { \
+              if(alpha == SDL_ALPHA_OPAQUE) { /* opaque source: copy RGB, keep destination alpha */ \
+                *dstp = (s & 0x00ffffff) | (*dstp & 0xff000000); \
+              } else { \
+                d = *dstp; \
+                dalpha = d & 0xff000000; \
+                s1 = s & 0xff00ff; /* red and blue are blended together in one multiply */ \
+                d1 = d & 0xff00ff; \
+                d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff; \
+                s &= 0xff00; /* green is blended on its own */ \
+                d &= 0xff00; \
+                d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
+                *dstp = d1 | d | dalpha; \
+              } \
+            } \
+            ++srcp; \
+            ++dstp; \
+            widthvar--; \
+	    }
+        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); /* scalar until dstp is 16-byte aligned for vec_st */
+        if (width > 0) {
+            int extrawidth = (width % 4);
+            vector unsigned char valigner = VEC_ALIGNER(srcp);
+            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
+            width -= extrawidth;
+            while (width) {
+                vector unsigned char voverflow;
+                vector unsigned char vd;
+                vector unsigned char valpha;
+                vector unsigned char vdstalpha;
+                /* s = *srcp */
+                voverflow = (vector unsigned char)vec_ld(15, srcp);
+                vs = vec_perm(vs, voverflow, valigner);
+
+                valpha = vec_perm(vs, v0, valphaPermute); /* each pixel's alpha spread across its 4 bytes */
+                
+                /* d = *dstp */
+                vd = (vector unsigned char)vec_ld(0, dstp);
+                vdstalpha = vec_and(vd, valphamask); /* remember the destination alpha before blending */
+
+                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
+
+                /* set the alpha to the dest alpha */
+                vd = vec_and(vd, vpixelmask);
+                vd = vec_or(vd, vdstalpha);
+
+                /* *dstp = res */
+                vec_st((vector unsigned int)vd, 0, dstp);
+                
+                srcp += 4;
+                dstp += 4;
+                width -= 4;
+                vs = voverflow; /* the block just loaded becomes the first half of the next realign */
+            }
+            ONE_PIXEL_BLEND((extrawidth), extrawidth); /* fewer than 4 pixels left in the row */
+        }
+	    srcp += srcskip;
+	    dstp += dstskip;
+	}
+#undef ONE_PIXEL_BLEND
+}
+
+/* Per-surface-alpha blend between 32-bit surfaces of arbitrary channel layout, forcing the result's alpha fully on: scalar until dstp is 16-byte aligned, 4 pixels per vector pass, scalar tail. */
+static void Blit32to32SurfaceAlphaAltivec(SDL_BlitInfo *info)
+{
+    unsigned alpha = info->src->alpha;
+    int height = info->d_height;
+    Uint32 *srcp = (Uint32 *)info->s_pixels;
+    int srcskip = info->s_skip >> 2;
+    Uint32 *dstp = (Uint32 *)info->d_pixels;
+    int dstskip = info->d_skip >> 2;
+    SDL_PixelFormat *srcfmt = info->src;
+    SDL_PixelFormat *dstfmt = info->dst;
+    unsigned sA = srcfmt->alpha;
+    unsigned dA = dstfmt->Amask ? SDL_ALPHA_OPAQUE : 0;
+    vector unsigned char mergePermute;
+    vector unsigned char vsrcPermute;
+    vector unsigned char vdstPermute;
+    vector unsigned char vsdstPermute;
+    vector unsigned char valpha;
+    vector unsigned char valphamask;
+    vector unsigned char vbits;
+    vector unsigned short v1;
+    vector unsigned short v8;
+
+    mergePermute = VEC_MERGE_PERMUTE();
+    v1 = vec_splat_u16(1);
+    v8 = vec_splat_u16(8);
+
+    /* set the alpha to 255 on the destination surf */
+    valphamask = VEC_ALPHA_MASK();
+
+    vsrcPermute = calc_swizzle32(srcfmt, NULL);   /* source layout -> ARGB */
+    vdstPermute = calc_swizzle32(NULL, dstfmt);   /* ARGB -> destination layout */
+    vsdstPermute = calc_swizzle32(dstfmt, NULL);  /* destination layout -> ARGB */
+
+    /* set a vector full of alpha and 255-alpha */
+    ((unsigned char *)&valpha)[0] = alpha;
+    valpha = vec_splat(valpha, 0);
+    vbits = (vector unsigned char)vec_splat_s8(-1);
+
+    while(height--) {
+        int width = info->d_width;
+#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
+            Uint32 pixel; \
+            unsigned sR, sG, sB, dR, dG, dB; \
+            DISEMBLE_RGB(((Uint8 *)srcp), 4, srcfmt, pixel, sR, sG, sB); \
+            DISEMBLE_RGB(((Uint8 *)dstp), 4, dstfmt, pixel, dR, dG, dB); \
+            ACCURATE_ALPHA_BLEND(sR, sG, sB, sA, dR, dG, dB); \
+            ASSEMBLE_RGBA(((Uint8 *)dstp), 4, dstfmt, dR, dG, dB, dA); \
+            ++srcp; \
+            ++dstp; \
+            widthvar--; \
+        }
+        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); /* scalar until dstp is 16-byte aligned for vec_st */
+        if (width > 0) {
+            int extrawidth = (width % 4);
+            vector unsigned char valigner = VEC_ALIGNER(srcp); /* plain vec_lvsl(0) re-reads the previous block when srcp is already aligned */
+            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
+            width -= extrawidth;
+            while (width) {
+                vector unsigned char voverflow;
+                vector unsigned char vd;
+
+                /* s = *srcp */
+                voverflow = (vector unsigned char)vec_ld(15, srcp);
+                vs = vec_perm(vs, voverflow, valigner);
+                vs = vec_perm(vs, valpha, vsrcPermute);
+                
+                /* d = *dstp */
+                vd = (vector unsigned char)vec_ld(0, dstp);
+                vd = vec_perm(vd, vd, vsdstPermute);
+
+                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
+
+                /* set the alpha channel to full on */
+                vd = vec_or(vd, valphamask);
+                vd = vec_perm(vd, vbits, vdstPermute);
+
+                /* *dstp = res */
+                vec_st((vector unsigned int)vd, 0, dstp);
+                
+                srcp += 4;
+                dstp += 4;
+                width -= 4;
+                vs = voverflow; /* the block just loaded becomes the first half of the next realign */
+            }
+            ONE_PIXEL_BLEND((extrawidth), extrawidth); /* fewer than 4 pixels left in the row */
+        }
+#undef ONE_PIXEL_BLEND
+ 
+        srcp += srcskip;
+        dstp += dstskip;
+    }
+
+}
+
+
+/* fast RGB888->(A)RGB888 blending with per-surface alpha; no channel swizzling is done and the destination's top byte is set to 0xff */
+static void BlitRGBtoRGBSurfaceAlphaAltivec(SDL_BlitInfo *info)
+{
+	unsigned alpha = info->src->alpha;
+    int height = info->d_height;
+    Uint32 *srcp = (Uint32 *)info->s_pixels;
+    int srcskip = info->s_skip >> 2;
+    Uint32 *dstp = (Uint32 *)info->d_pixels;
+    int dstskip = info->d_skip >> 2;
+    vector unsigned char mergePermute;
+    vector unsigned char valpha;
+    vector unsigned char valphamask;
+    vector unsigned short v1;
+    vector unsigned short v8;
+
+    mergePermute = VEC_MERGE_PERMUTE();
+    v1 = vec_splat_u16(1);
+    v8 = vec_splat_u16(8);
+
+    /* set the alpha to 255 on the destination surf */
+    valphamask = VEC_ALPHA_MASK();
+
+    /* set a vector full of alpha and 255-alpha */
+    ((unsigned char *)&valpha)[0] = alpha;
+    valpha = vec_splat(valpha, 0);
+
+    while(height--) {
+        int width = info->d_width;
+#define ONE_PIXEL_BLEND(condition, widthvar) while ((condition)) { \
+            Uint32 s = *srcp; \
+            Uint32 d = *dstp; \
+            Uint32 s1 = s & 0xff00ff; /* red and blue are blended together in one multiply */ \
+            Uint32 d1 = d & 0xff00ff; \
+            d1 = (d1 + ((s1 - d1) * alpha >> 8)) \
+                 & 0xff00ff; \
+            s &= 0xff00; /* green is blended on its own */ \
+            d &= 0xff00; \
+            d = (d + ((s - d) * alpha >> 8)) & 0xff00; \
+            *dstp = d1 | d | 0xff000000; \
+            ++srcp; \
+            ++dstp; \
+            widthvar--; \
+        }
+        ONE_PIXEL_BLEND((UNALIGNED_PTR(dstp)) && (width), width); /* scalar until dstp is 16-byte aligned for vec_st */
+        if (width > 0) {
+            int extrawidth = (width % 4);
+            vector unsigned char valigner = VEC_ALIGNER(srcp);
+            vector unsigned char vs = (vector unsigned char)vec_ld(0, srcp);
+            width -= extrawidth;
+            while (width) {
+                vector unsigned char voverflow;
+                vector unsigned char vd;
+
+                /* s = *srcp */
+                voverflow = (vector unsigned char)vec_ld(15, srcp);
+                vs = vec_perm(vs, voverflow, valigner);
+                
+                /* d = *dstp */
+                vd = (vector unsigned char)vec_ld(0, dstp);
+
+                VEC_MULTIPLY_ALPHA(vs, vd, valpha, mergePermute, v1, v8);
+
+                /* set the alpha channel to full on */
+                vd = vec_or(vd, valphamask);
+
+                /* *dstp = res */
+                vec_st((vector unsigned int)vd, 0, dstp);
+                
+                srcp += 4;
+                dstp += 4;
+                width -= 4;
+                vs = voverflow; /* the block just loaded becomes the first half of the next realign */
+            }
+            ONE_PIXEL_BLEND((extrawidth), extrawidth); /* fewer than 4 pixels left in the row */
+        }
+#undef ONE_PIXEL_BLEND
+ 
+        srcp += srcskip;
+        dstp += dstskip;
+    }
+}
+#endif /* USE_ALTIVEC_BLITTERS */
+
 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
 static void BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo *info)
 {
@@ -1372,7 +2128,12 @@
 	    if(df->BytesPerPixel == 1)
 		return BlitNto1SurfaceAlphaKey;
 	    else
-		return BlitNtoNSurfaceAlphaKey;
+#ifdef USE_ALTIVEC_BLITTERS
+        if (sf->BytesPerPixel == 4 && df->BytesPerPixel == 4 && SDL_HasAltiVec())
+            return Blit32to32SurfaceAlphaKeyAltivec;
+        else
+#endif
+            return BlitNtoNSurfaceAlphaKey;
 	} else {
 	    /* Per-surface alpha blits */
 	    switch(df->BytesPerPixel) {
@@ -1414,9 +2175,19 @@
 		    return BlitRGBtoRGBSurfaceAlphaMMX;
 		else
 #endif
+#ifdef USE_ALTIVEC_BLITTERS
+        if(SDL_HasAltiVec())
+            return BlitRGBtoRGBSurfaceAlphaAltivec;
+        else
+#endif
 		    return BlitRGBtoRGBSurfaceAlpha;
 		}
 		else
+#ifdef USE_ALTIVEC_BLITTERS
+        if((sf->BytesPerPixel == 4) && SDL_HasAltiVec())
+            return Blit32to32SurfaceAlphaAltivec;
+        else
+#endif
 		    return BlitNtoNSurfaceAlpha;
 
 	    case 3:
@@ -1431,6 +2202,13 @@
 	    return BlitNto1PixelAlpha;
 
 	case 2:
+#ifdef USE_ALTIVEC_BLITTERS
+        if(sf->BytesPerPixel == 4 && 
+           df->Gmask == 0x7e0 &&
+           df->Bmask == 0x1f)
+            return Blit32to565PixelAlphaAltivec;
+        else
+#endif
 	    if(sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
 	       && sf->Gmask == 0xff00
 	       && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
@@ -1457,8 +2235,18 @@
 		    return BlitRGBtoRGBPixelAlphaMMX;
 		else
 #endif
+#ifdef USE_ALTIVEC_BLITTERS
+        if(SDL_HasAltiVec())
+            return BlitRGBtoRGBPixelAlphaAltivec;
+        else
+#endif
 		    return BlitRGBtoRGBPixelAlpha;
 	    }
+#ifdef USE_ALTIVEC_BLITTERS
+        if (sf->Amask && sf->BytesPerPixel == 4 && SDL_HasAltiVec())
+            return Blit32to32PixelAlphaAltivec;
+        else
+#endif
 	    return BlitNtoNPixelAlpha;
 
 	case 3: