changeset 3144:0d8d1f870964 gsoc2009_ps3

Moved SPE-functions to SDL_ps3spe.c. Added ActivateRenderer() and PS3_QueryTexturePixels(). Added yuv2rgb_spu but not yet in use.
author Martin Lowinski <martin@goldtopf.org>
date Wed, 10 Jun 2009 09:15:33 +0000
parents 8fdabaa064c3
children 7828eed2f31a
files configure.in src/video/SDL_yuv_sw.c src/video/SDL_yuv_sw_c.h src/video/ps3/SDL_ps3render.c src/video/ps3/SDL_ps3spe.c src/video/ps3/SDL_ps3spe_c.h src/video/ps3/SDL_ps3video.c src/video/ps3/SDL_ps3video.h src/video/ps3/spulibs/Makefile src/video/ps3/spulibs/yuv2rgb_converter.c
diffstat 10 files changed, 1017 insertions(+), 220 deletions(-) [+]
line wrap: on
line diff
--- a/configure.in	Sat Jun 06 06:40:23 2009 +0000
+++ b/configure.in	Wed Jun 10 09:15:33 2009 +0000
@@ -1522,7 +1522,7 @@
       AC_DEFINE(SDL_VIDEO_DRIVER_PS3)
       SOURCES="$SOURCES $srcdir/src/video/ps3/*.c"
       EXTRA_CFLAGS="$EXTRA_CFLAGS -I/opt/cell/sdk/usr/include"
-      EXTRA_LDFLAGS="$EXTRA_LDFLAGS -L/opt/cell/sdk/usr/lib -lspe2 -lfb_writer_spu"
+      EXTRA_LDFLAGS="$EXTRA_LDFLAGS -L/opt/cell/sdk/usr/lib -lspe2 -lfb_writer_spu -lyuv2rgb_spu"
       have_video=yes
     fi   
   fi
--- a/src/video/SDL_yuv_sw.c	Sat Jun 06 06:40:23 2009 +0000
+++ b/src/video/SDL_yuv_sw.c	Wed Jun 10 09:15:33 2009 +0000
@@ -88,32 +88,6 @@
 #include "SDL_yuv_sw_c.h"
 
 
-struct SDL_SW_YUVTexture
-{
-    Uint32 format;
-    Uint32 target_format;
-    int w, h;
-    Uint8 *pixels;
-    int *colortab;
-    Uint32 *rgb_2_pix;
-    void (*Display1X) (int *colortab, Uint32 * rgb_2_pix,
-                       unsigned char *lum, unsigned char *cr,
-                       unsigned char *cb, unsigned char *out,
-                       int rows, int cols, int mod);
-    void (*Display2X) (int *colortab, Uint32 * rgb_2_pix,
-                       unsigned char *lum, unsigned char *cr,
-                       unsigned char *cb, unsigned char *out,
-                       int rows, int cols, int mod);
-
-    /* These are just so we don't have to allocate them separately */
-    Uint16 pitches[3];
-    Uint8 *planes[3];
-
-    /* This is a temporary surface in case we have to stretch copy */
-    SDL_Surface *stretch;
-    SDL_Surface *display;
-};
-
 /* The colorspace conversion functions */
 
 #if (__GNUC__ > 2) && defined(__i386__) && __OPTIMIZE__ && SDL_ASSEMBLY_ROUTINES
--- a/src/video/SDL_yuv_sw_c.h	Sat Jun 06 06:40:23 2009 +0000
+++ b/src/video/SDL_yuv_sw_c.h	Wed Jun 10 09:15:33 2009 +0000
@@ -26,6 +26,32 @@
 
 /* This is the software implementation of the YUV texture support */
 
+struct SDL_SW_YUVTexture
+{
+    Uint32 format;
+    Uint32 target_format;
+    int w, h;
+    Uint8 *pixels;
+    int *colortab;
+    Uint32 *rgb_2_pix;
+    void (*Display1X) (int *colortab, Uint32 * rgb_2_pix,
+                       unsigned char *lum, unsigned char *cr,
+                       unsigned char *cb, unsigned char *out,
+                       int rows, int cols, int mod);
+    void (*Display2X) (int *colortab, Uint32 * rgb_2_pix,
+                       unsigned char *lum, unsigned char *cr,
+                       unsigned char *cb, unsigned char *out,
+                       int rows, int cols, int mod);
+
+    /* These are just so we don't have to allocate them separately */
+    Uint16 pitches[3];
+    Uint8 *planes[3];
+
+    /* This is a temporary surface in case we have to stretch copy */
+    SDL_Surface *stretch;
+    SDL_Surface *display;
+};
+
 typedef struct SDL_SW_YUVTexture SDL_SW_YUVTexture;
 
 SDL_SW_YUVTexture *SDL_SW_CreateYUVTexture(Uint32 format, int w, int h);
--- a/src/video/ps3/SDL_ps3render.c	Sat Jun 06 06:40:23 2009 +0000
+++ b/src/video/ps3/SDL_ps3render.c	Wed Jun 10 09:15:33 2009 +0000
@@ -27,7 +27,7 @@
 #include "../SDL_renderer_sw.h"
 
 #include "SDL_ps3video.h"
-#include "spulibs/spu_common.h"
+#include "SDL_ps3spe_c.h"
 
 #include <fcntl.h>
 #include <stdlib.h>
@@ -35,13 +35,17 @@
 #include <linux/kd.h>
 #include <linux/fb.h>
 #include <sys/mman.h>
+#include <asm/ps3fb.h>
 
-#include <asm/ps3fb.h>
+
+/* SPE program handle of the yuv2rgb_spu executable */
+extern spe_program_handle_t yuv2rgb_spu;
 
 /* SDL surface based renderer implementation */
 
 static SDL_Renderer *SDL_PS3_CreateRenderer(SDL_Window * window,
                                               Uint32 flags);
+static int SDL_PS3_ActivateRenderer(SDL_Renderer * renderer);
 static int SDL_PS3_RenderPoint(SDL_Renderer * renderer, int x, int y);
 static int SDL_PS3_RenderLine(SDL_Renderer * renderer, int x1, int y1,
                                 int x2, int y2);
@@ -56,6 +60,7 @@
 
 /* Texture */
 static int PS3_CreateTexture(SDL_Renderer * renderer, SDL_Texture * texture);
+static int PS3_QueryTexturePixels(SDL_Renderer * renderer, SDL_Texture * texture, void **pixels, int *pitch);
 static void PS3_DestroyTexture(SDL_Renderer * renderer, SDL_Texture * texture);
 
 
@@ -95,13 +100,18 @@
 
     /* Use two buffers in fb? res < 720p */
     unsigned int double_buffering;
+
+    /* SPE threading stuff */
+    spu_data_t * converter_thread_data;
+    /* YUV converting transfer data */
+    volatile struct yuv2rgb_parms_t * converter_parms __attribute__((aligned(128)));
 } SDL_PS3_RenderData;
 
 typedef struct
 {
-    void *pixels;
     int pitch;
     int bpp;
+    volatile void *pixels __attribute__((aligned(128)));
 } PS3_TextureData;
 
 SDL_Renderer *
@@ -138,6 +148,8 @@
 
     //renderer->CreateTexture = PS3_CreateTexture;
     //renderer->DestroyTexture = PS3_DestroyTexture;
+    //renderer->QueryTexturePixels = PS3_QueryTexturePixels;
+    renderer->ActivateRenderer = SDL_PS3_ActivateRenderer;
     renderer->RenderPoint = SDL_PS3_RenderPoint;
     renderer->RenderLine = SDL_PS3_RenderLine;
     renderer->RenderFill = SDL_PS3_RenderFill;
@@ -184,10 +196,42 @@
     }
     data->current_screen = 0;
 
+    /* Create SPU parms structure */
+    data->converter_parms = (struct yuv2rgb_parms_t *) memalign(16, sizeof(struct yuv2rgb_parms_t));
+    if (data->converter_parms == NULL) {
+        SDL_PS3_DestroyRenderer(renderer);
+        SDL_OutOfMemory();
+        return NULL;
+    }
+
+    /* Set up the SPEs */
+    data->converter_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t));
+    if (data->converter_thread_data == NULL) {
+        SDL_PS3_DestroyRenderer(renderer);
+        SDL_OutOfMemory();
+        return NULL;
+    }
+
+    data->converter_thread_data->program = yuv2rgb_spu;
+    data->converter_thread_data->program_name = "yuv2rgb_spu";
+    data->converter_thread_data->keepalive = 1;
+    data->converter_thread_data->booted = 0;
+
+    SPE_Start(data->converter_thread_data);
+
     return renderer;
 }
 
 static int
+SDL_PS3_ActivateRenderer(SDL_Renderer * renderer)
+{
+    deprintf(1, "PS3_ActivateRenderer()\n");
+    SDL_PS3_RenderData *data = (SDL_PS3_RenderData *) renderer->driverdata;
+
+    return 0;
+}
+
+static int
 PS3_CreateTexture(SDL_Renderer * renderer, SDL_Texture * texture) {
     deprintf(1, "PS3_CreateTexture()\n");
     PS3_TextureData *data;
@@ -197,6 +241,7 @@
         return -1;
     }
 
+    data->bpp = SDL_BYTESPERPIXEL(texture->format);
     data->pitch = (texture->w * SDL_BYTESPERPIXEL(texture->format));
 
     data->pixels = NULL;
@@ -211,16 +256,29 @@
     return 0;
 }
 
+static int
+PS3_QueryTexturePixels(SDL_Renderer * renderer, SDL_Texture * texture,
+                      void **pixels, int *pitch)
+{
+    PS3_TextureData *data = (PS3_TextureData *) texture->driverdata;
+
+    *pixels = (void *)data->pixels;
+    *pitch = data->pitch;
+
+    return 0;
+}
+
 static void
 PS3_DestroyTexture(SDL_Renderer * renderer, SDL_Texture * texture)
 {
+    deprintf(1, "PS3_DestroyTexture()\n");
     PS3_TextureData *data = (PS3_TextureData *) texture->driverdata;
 
     if (!data) {
         return;
     }
 
-    free(data->pixels);
+    free((void *)data->pixels);
 }
 
 static int
@@ -302,7 +360,8 @@
         (SDL_PS3_RenderData *) renderer->driverdata;
     SDL_Window *window = SDL_GetWindowFromID(renderer->window);
     SDL_VideoDisplay *display = SDL_GetDisplayFromWindow(window);
-    PS3_TextureData *txdata = (PS3_TextureData *) texture->driverdata;
+    //PS3_TextureData *txdata = (PS3_TextureData *) texture->driverdata;
+    SDL_SW_YUVTexture *txdata = (SDL_SW_YUVTexture *) texture->driverdata;
     SDL_VideoData *devdata = display->device->driverdata;
 
     if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) {
@@ -311,10 +370,41 @@
         void *pixels =
             (Uint8 *) target->pixels + dstrect->y * target->pitch +
             dstrect->x * target->format->BytesPerPixel;
+#if 0
+        /* Not yet tested */
+        Uint8 *lum, *Cr, *Cb;
+        SDL_SW_YUVTexture *swdata = (SDL_SW_YUVTexture *) texture->driverdata;
+        switch (swdata->format) {
+            case SDL_PIXELFORMAT_YV12:
+                lum = swdata->planes[0];
+                Cr = swdata->planes[1];
+                Cb = swdata->planes[2];
+                break;
+            case SDL_PIXELFORMAT_IYUV:
+                lum = swdata->planes[0];
+                Cr = swdata->planes[2];
+                Cb = swdata->planes[1];
+                break;
+            default:
+                return -1;
+        }
+
+        data->converter_parms->src_pixel_width = dstrect->w;
+        data->converter_parms->src_pixel_height = dstrect->h;
+        data->converter_parms->dstBuffer = (Uint8 *)pixels;
+        data->converter_thread_data->argp = (void *)data->converter_parms;
+
+        /* Convert YUV overlay to RGB */
+        SPE_SendMsg(data->converter_thread_data, SPU_START);
+        SPE_SendMsg(data->converter_thread_data, (unsigned int)data->converter_thread_data->argp);
+
+        return 0;
+#else
         return SDL_SW_CopyYUVToRGB((SDL_SW_YUVTexture *) texture->driverdata,
                                    srcrect, display->current_mode.format,
                                    dstrect->w, dstrect->h, pixels,
                                    target->pitch);
+#endif
     } else {
         deprintf(1, "SDL_ISPIXELFORMAT_FOURCC = false\n");
         SDL_Surface *surface = (SDL_Surface *) texture->driverdata;
@@ -330,12 +420,12 @@
         deprintf(1, "dstrect->w = %u\n", dstrect->w);
         deprintf(1, "dstrect->h = %u\n", dstrect->h);
 
-        deprintf(1, "txdata->bpp = %u\n", txdata->bpp);
+        //deprintf(1, "txdata->bpp = %u\n", txdata->bpp);
         deprintf(1, "texture->format (bpp) = %u\n", SDL_BYTESPERPIXEL(texture->format));
 
         /* For testing, align pixels */
-        void *pixels = (void *)memalign(16, dstrect->h * data->screens[0]->pitch);
-        SDL_memcpy(pixels, surface->pixels, dstrect->h * data->screens[0]->pitch);
+        void *pixels = (void *)memalign(16, window->h * window->w * 4);
+        SDL_memcpy(pixels, surface->pixels, window->h * window->w * 4);
 
         /* Get screeninfo */
         struct fb_fix_screeninfo fb_finfo;
@@ -349,9 +439,9 @@
             return -1;
         }
         /* 16 and 15 bpp is reported as 16 bpp */
-        txdata->bpp = fb_vinfo.bits_per_pixel;
-        if (txdata->bpp == 16)
-            txdata->bpp = fb_vinfo.red.length + fb_vinfo.green.length + fb_vinfo.blue.length;
+        //txdata->bpp = fb_vinfo.bits_per_pixel;
+        //if (txdata->bpp == 16)
+        //    txdata->bpp = fb_vinfo.red.length + fb_vinfo.green.length + fb_vinfo.blue.length;
 
         /* Adjust centering */
         data->bounded_width = window->w < fb_vinfo.xres ? window->w : fb_vinfo.xres;
@@ -372,7 +462,8 @@
         devdata->fb_parms->in_line_stride = dstrect->w * /*txdata->bpp / 8*/4;
         devdata->fb_parms->bounded_input_height = data->bounded_height;
         devdata->fb_parms->bounded_input_width = data->bounded_width;
-        devdata->fb_parms->fb_pixel_size = txdata->bpp / 8;
+        //devdata->fb_parms->fb_pixel_size = txdata->bpp / 8;
+        devdata->fb_parms->fb_pixel_size = SDL_BYTESPERPIXEL(texture->format);
 
         deprintf(3, "[PS3->SPU] fb_thread_data->argp = 0x%x\n", devdata->fb_thread_data->argp);
         
@@ -438,6 +529,16 @@
                 SDL_FreeSurface(data->screens[i]);
             }
         }
+
+        /* Shutdown SPE and related resources */
+        if (data->converter_parms) {
+            free((void *)data->converter_parms);
+        }
+        if (data->converter_thread_data) {
+            SPE_Shutdown(data->converter_thread_data);
+            free((void *)data->converter_thread_data);
+        }
+
         SDL_free(data);
     }
     SDL_free(renderer);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/video/ps3/SDL_ps3spe.c	Wed Jun 10 09:15:33 2009 +0000
@@ -0,0 +1,167 @@
+/*
+    SDL - Simple DirectMedia Layer
+    Copyright (C) 1997-2009 Sam Lantinga
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    Sam Lantinga
+    slouken@libsdl.org
+*/
+#include "SDL_config.h"
+
+#include "SDL_video.h"
+#include "SDL_ps3spe_c.h"
+
+#include "SDL_ps3video.h"
+#include "SDL_ps3render_c.h"
+
+
+/* Start the SPE thread */
+int SPE_Start(spu_data_t * spe_data)
+{
+  deprintf(2, "[PS3->SPU] Start SPE: %s\n", spe_data->program_name);
+  if (!(spe_data->booted))
+    SPE_Boot(spe_data);
+
+  /* To allow re-running of context, spe_ctx_entry has to be set before each call */
+  spe_data->entry = SPE_DEFAULT_ENTRY;
+  spe_data->error_code = 0;
+
+  /* Create SPE thread and run */
+  deprintf(2, "[PS3->SPU] Create Thread: %s\n", spe_data->program_name);
+  if (pthread_create
+      (&spe_data->thread, NULL, (void *)&SPE_RunContext, (void *)spe_data)) {
+    deprintf(2, "[PS3->SPU] Could not create pthread for spe: %s\n", spe_data->program_name);
+    SDL_SetError("[PS3->SPU] Could not create pthread for spe");
+    return -1;
+  }
+
+  if (spe_data->keepalive)
+    SPE_WaitForMsg(spe_data, SPU_READY);
+}
+
+/* Stop the SPE thread */
+int SPE_Stop(spu_data_t * spe_data)
+{
+  deprintf(2, "[PS3->SPU] Stop SPE: %s\n", spe_data->program_name);
+  /* Wait for SPE thread to complete */
+  deprintf(2, "[PS3->SPU] Wait for SPE thread to complete: %s\n", spe_data->program_name);
+  if (pthread_join(spe_data->thread, NULL)) {
+    deprintf(2, "[PS3->SPU] Failed joining the thread: %s\n", spe_data->program_name);
+    SDL_SetError("[PS3->SPU] Failed joining the thread");
+    return -1;
+  }
+
+  return 0;
+}
+
+/* Create SPE context and load program */
+int SPE_Boot(spu_data_t * spe_data)
+{
+  /* Create SPE context */
+  deprintf(2, "[PS3->SPU] Create SPE Context: %s\n", spe_data->program_name);
+  spe_data->ctx = spe_context_create(0, NULL);
+  if (spe_data->ctx == NULL) {
+    deprintf(2, "[PS3->SPU] Failed creating SPE context: %s\n", spe_data->program_name);
+    SDL_SetError("[PS3->SPU] Failed creating SPE context");
+    return -1;
+  }
+
+  /* Load SPE object into SPE local store */
+  deprintf(2, "[PS3->SPU] Load Program into SPE: %s\n", spe_data->program_name);
+  if (spe_program_load(spe_data->ctx, &spe_data->program)) {
+    deprintf(2, "[PS3->SPU] Failed loading program into SPE context: %s\n", spe_data->program_name);
+    SDL_SetError
+        ("[PS3->SPU] Failed loading program into SPE context");
+    return -1;
+  }
+  spe_data->booted = 1;
+  deprintf(2, "[PS3->SPU] SPE boot successful\n");
+
+  return 0;
+}
+
+/* (Stop and) shutdown the SPE */
+int SPE_Shutdown(spu_data_t * spe_data)
+{
+  if (spe_data->keepalive && spe_data->booted) {
+    SPE_SendMsg(spe_data, SPU_EXIT);
+    SPE_Stop(spe_data);
+  }
+
+  /* Destroy SPE context */
+  deprintf(2, "[PS3->SPU] Destroy SPE context: %s\n", spe_data->program_name);
+  if (spe_context_destroy(spe_data->ctx)) {
+    deprintf(2, "[PS3->SPU] Failed destroying context: %s\n", spe_data->program_name);
+    SDL_SetError("[PS3->SPU] Failed destroying context");
+    return -1;
+  }
+  deprintf(2, "[PS3->SPU] SPE shutdown successful: %s\n", spe_data->program_name);
+  return 0;
+}
+
+/* Send message to the SPE via mailbox */
+int SPE_SendMsg(spu_data_t * spe_data, unsigned int msg)
+{
+  deprintf(2, "[PS3->SPU] Sending message %u to %s\n", msg, spe_data->program_name);
+  /* Send one message, block until message was sent */
+  unsigned int spe_in_mbox_msgs[1];
+  spe_in_mbox_msgs[0] = msg;
+  int in_mbox_write = spe_in_mbox_write(spe_data->ctx, spe_in_mbox_msgs, 1, SPE_MBOX_ALL_BLOCKING);
+
+  if (1 > in_mbox_write) {
+    deprintf(2, "[PS3->SPU] No message could be written to %s\n", spe_data->program_name);
+    SDL_SetError("[PS3->SPU] No message could be written");
+    return -1;
+  }
+  return 0;
+}
+
+
+/* Read 1 message from SPE, block until at least 1 message was received */
+int SPE_WaitForMsg(spu_data_t * spe_data, unsigned int msg)
+{
+  deprintf(2, "[PS3->SPU] Waiting for message from %s\n", spe_data->program_name);
+  unsigned int out_messages[1];
+  while (!spe_out_mbox_status(spe_data->ctx));
+  int mbox_read = spe_out_mbox_read(spe_data->ctx, out_messages, 1);
+  deprintf(2, "[PS3->SPU] Got message from %s, message was %u\n", spe_data->program_name, out_messages[0]);
+  if (out_messages[0] == msg)
+    return 0;
+  else
+    return -1;
+}
+
+/* Re-runnable invocation of the spe_context_run call */
+void SPE_RunContext(void *thread_argp)
+{ 
+  /* argp is the pointer to argument to be passed to the SPE program */
+  spu_data_t *args = (spu_data_t *) thread_argp;
+  deprintf(3, "[PS3->SPU] void* argp=0x%x\n", (unsigned int)args->argp);
+  
+  /* Run it.. */
+  deprintf(2, "[PS3->SPU] Run SPE program: %s\n", args->program_name);
+  if (spe_context_run
+      (args->ctx, &args->entry, 0, (void *)args->argp, NULL,
+       NULL) < 0) {
+    deprintf(2, "[PS3->SPU] Failed running SPE context: %s\n", args->program_name);
+    SDL_SetError("[PS3->SPU] Failed running SPE context: %s", args->program_name);
+    exit(1);
+  }
+
+  pthread_exit(NULL);
+}
+
+/* vi: set ts=4 sw=4 expandtab: */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/video/ps3/SDL_ps3spe_c.h	Wed Jun 10 09:15:33 2009 +0000
@@ -0,0 +1,55 @@
+/*
+    SDL - Simple DirectMedia Layer
+    Copyright (C) 1997-2009 Sam Lantinga
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
+    Sam Lantinga
+    slouken@libsdl.org
+*/
+#include "SDL_config.h"
+
+#include "spulibs/spu_common.h"
+
+#include <libspe2.h>
+
+#ifndef _SDL_ps3spe_h
+#define _SDL_ps3spe_h
+
+/* SPU thread data */
+typedef struct spu_data {
+    spe_context_ptr_t ctx;
+    spe_program_handle_t program;
+    pthread_t thread;
+    char * program_name;
+    unsigned int booted;
+    unsigned int keepalive;
+    unsigned int entry;
+    int error_code;
+    void * argp;
+} spu_data_t;
+
+/* SPU specific functions */
+int SPE_Start(spu_data_t * spe_data);
+int SPE_Stop(spu_data_t * spe_data);
+int SPE_Boot(spu_data_t * spe_data);
+int SPE_Shutdown(spu_data_t * spe_data);
+int SPE_SendMsg(spu_data_t * spe_data, unsigned int msg);
+int SPE_WaitForMsg(spu_data_t * spe_data, unsigned int msg);
+void SPE_RunContext(void *thread_argp);
+
+#endif /* _SDL_ps3spe_h */
+
+/* vi: set ts=4 sw=4 expandtab: */
--- a/src/video/ps3/SDL_ps3video.c	Sat Jun 06 06:40:23 2009 +0000
+++ b/src/video/ps3/SDL_ps3video.c	Wed Jun 10 09:15:33 2009 +0000
@@ -33,16 +33,15 @@
 #include "../SDL_sysvideo.h"
 #include "../SDL_pixels_c.h"
 #include "../../events/SDL_events_c.h"
-#include "spulibs/spu_common.h"
 
 #include "SDL_ps3video.h"
+#include "SDL_ps3spe_c.h"
 #include "SDL_ps3events_c.h"
 #include "SDL_ps3render_c.h"
 
 #include <fcntl.h>
 #include <linux/fb.h>
 #include <asm/ps3fb.h>
-#include <libspe2.h>
 #include <sys/mman.h>
 
 #define PS3VID_DRIVER_NAME "ps3"
@@ -52,15 +51,6 @@
 static int PS3_SetDisplayMode(_THIS, SDL_DisplayMode * mode);
 static void PS3_VideoQuit(_THIS);
 
-/* SPU specific functions */
-int SPE_Start(_THIS, spu_data_t * spe_data);
-int SPE_Stop(_THIS, spu_data_t * spe_data);
-int SPE_Boot(_THIS, spu_data_t * spe_data);
-int SPE_Shutdown(_THIS, spu_data_t * spe_data);
-int SPE_SendMsg(spu_data_t * spe_data, unsigned int msg);
-int SPE_WaitForMsg(spu_data_t * spe_data, unsigned int msg);
-void SPE_RunContext(void *thread_argp);
-
 /* Stores the SPE executable name of fb_writer_spu */
 extern spe_program_handle_t fb_writer_spu;
 
@@ -167,7 +157,7 @@
     data->fb_thread_data->keepalive = 1;
     data->fb_thread_data->booted = 0;
 
-    SPE_Start(_this, data->fb_thread_data);
+    SPE_Start(data->fb_thread_data);
 
     /* Open the device */
     data->fbdev = open(PS3DEV, O_RDWR);
@@ -198,7 +188,7 @@
         SDL_SetError("[PS3] Can't mmap for %s", PS3DEV);
         return (0);
     } else {
-        //current->flags |= SDL_DOUBLEBUF;
+        /* TODO: enable double buffering (nothing is done here yet) */
     }
 
     /* Blank screen */
@@ -220,6 +210,8 @@
 {
     deprintf(1, "PS3_VideoQuit()\n");
     SDL_VideoData *data = (SDL_VideoData *) _this->driverdata;
+
+    /* Unmap framebuffer */
     if (data->frame_buffer) {
         struct fb_fix_screeninfo fb_finfo;
         if (ioctl(data->fbdev, FBIOGET_FSCREENINFO, &fb_finfo) != -1) {
@@ -228,154 +220,21 @@
         }
     }
 
+    /* Shutdown SPE and related resources */
     if (data->fb_parms)
         free((void *)data->fb_parms);
     if (data->fb_thread_data) {
-        SPE_Shutdown(_this, data->fb_thread_data);
+        SPE_Shutdown(data->fb_thread_data);
         free((void *)data->fb_thread_data);
     }
+
+    /* Close device */
+    if (data->fbdev > 0) {
+        /* Give control of frame buffer back to kernel */
+        ioctl(data->fbdev, PS3FB_IOCTL_OFF, 0);
+        close(data->fbdev);
+        data->fbdev = -1;
+    }
 }
 
-
-/*
- * SPE handling
- */
-
-/* Start the SPE thread */
-int SPE_Start(_THIS, spu_data_t * spe_data)
-{
-  deprintf(2, "[PS3->SPU] Start SPE: %s\n", spe_data->program_name);
-  if (!(spe_data->booted))
-    SPE_Boot(_this, spe_data);
-
-  /* To allow re-running of context, spe_ctx_entry has to be set before each call */
-  spe_data->entry = SPE_DEFAULT_ENTRY;
-  spe_data->error_code = 0;
-
-  /* Create SPE thread and run */
-  deprintf(2, "[PS3->SPU] Create Thread: %s\n", spe_data->program_name);
-  if (pthread_create
-      (&spe_data->thread, NULL, (void *)&SPE_RunContext, (void *)spe_data)) {
-    deprintf(2, "[PS3->SPU] Could not create pthread for spe: %s\n", spe_data->program_name);
-    SDL_SetError("[PS3->SPU] Could not create pthread for spe");
-    return -1;
-  }
-
-  if (spe_data->keepalive)
-    SPE_WaitForMsg(spe_data, SPU_READY);
-}
-
-
-/* Stop the SPE thread */
-int SPE_Stop(_THIS, spu_data_t * spe_data)
-{
-  deprintf(2, "[PS3->SPU] Stop SPE: %s\n", spe_data->program_name);
-  /* Wait for SPE thread to complete */
-  deprintf(2, "[PS3->SPU] Wait for SPE thread to complete: %s\n", spe_data->program_name);
-  if (pthread_join(spe_data->thread, NULL)) {
-    deprintf(2, "[PS3->SPU] Failed joining the thread: %s\n", spe_data->program_name);
-    SDL_SetError("[PS3->SPU] Failed joining the thread");
-    return -1;
-  }
-
-  return 0;
-}
-
-/* Create SPE context and load program */
-int SPE_Boot(_THIS, spu_data_t * spe_data)
-{
-  /* Create SPE context */
-  deprintf(2, "[PS3->SPU] Create SPE Context: %s\n", spe_data->program_name);
-  spe_data->ctx = spe_context_create(0, NULL);
-  if (spe_data->ctx == NULL) {
-    deprintf(2, "[PS3->SPU] Failed creating SPE context: %s\n", spe_data->program_name);
-    SDL_SetError("[PS3->SPU] Failed creating SPE context");
-    return -1;
-  }
-
-  /* Load SPE object into SPE local store */
-  deprintf(2, "[PS3->SPU] Load Program into SPE: %s\n", spe_data->program_name);
-  if (spe_program_load(spe_data->ctx, &spe_data->program)) {
-    deprintf(2, "[PS3->SPU] Failed loading program into SPE context: %s\n", spe_data->program_name);
-    SDL_SetError
-        ("[PS3->SPU] Failed loading program into SPE context");
-    return -1;
-  }
-  spe_data->booted = 1;
-  deprintf(2, "[PS3->SPU] SPE boot successful\n");
-
-  return 0;
-}
-
-/* (Stop and) shutdown the SPE */
-int SPE_Shutdown(_THIS, spu_data_t * spe_data)
-{
-  if (spe_data->keepalive && spe_data->booted) {
-    SPE_SendMsg(spe_data, SPU_EXIT);
-    SPE_Stop(_this, spe_data);
-  }
-
-  /* Destroy SPE context */
-  deprintf(2, "[PS3->SPU] Destroy SPE context: %s\n", spe_data->program_name);
-  if (spe_context_destroy(spe_data->ctx)) {
-    deprintf(2, "[PS3->SPU] Failed destroying context: %s\n", spe_data->program_name);
-    SDL_SetError("[PS3->SPU] Failed destroying context");
-    return -1;
-  }
-  deprintf(2, "[PS3->SPU] SPE shutdown successful: %s\n", spe_data->program_name);
-  return 0;
-}
-
-/* Send message to the SPE via mailboxe */
-int SPE_SendMsg(spu_data_t * spe_data, unsigned int msg)
-{
-  deprintf(2, "[PS3->SPU] Sending message %u to %s\n", msg, spe_data->program_name);
-  /* Send one message, block until message was sent */
-  unsigned int spe_in_mbox_msgs[1];
-  spe_in_mbox_msgs[0] = msg;
-  int in_mbox_write = spe_in_mbox_write(spe_data->ctx, spe_in_mbox_msgs, 1, SPE_MBOX_ALL_BLOCKING);
-
-  if (1 > in_mbox_write) {
-    deprintf(2, "[PS3->SPU] No message could be written to %s\n", spe_data->program_name);
-    SDL_SetError("[PS3->SPU] No message could be written");
-    return -1;
-  }
-  return 0;
-}
-
-
-/* Read 1 message from SPE, block until at least 1 message was received */
-int SPE_WaitForMsg(spu_data_t * spe_data, unsigned int msg)
-{
-  deprintf(2, "[PS3->SPU] Waiting for message from %s\n", spe_data->program_name);
-  unsigned int out_messages[1];
-  while (!spe_out_mbox_status(spe_data->ctx));
-  int mbox_read = spe_out_mbox_read(spe_data->ctx, out_messages, 1);
-  deprintf(2, "[PS3->SPU] Got message from %s, message was %u\n", spe_data->program_name, out_messages[0]);
-  if (out_messages[0] == msg)
-    return 0;
-  else
-    return -1;
-}
-
-/* Re-runnable invocation of the spe_context_run call */
-void SPE_RunContext(void *thread_argp)
-{ 
-  /* argp is the pointer to argument to be passed to the SPE program */
-  spu_data_t *args = (spu_data_t *) thread_argp;
-  deprintf(3, "[PS3->SPU] void* argp=0x%x\n", (unsigned int)args->argp);
-  
-  /* Run it.. */
-  deprintf(2, "[PS3->SPU] Run SPE program: %s\n", args->program_name);
-  if (spe_context_run
-      (args->ctx, &args->entry, 0, (void *)args->argp, NULL,
-       NULL) < 0) {
-    deprintf(2, "[PS3->SPU] Failed running SPE context: %s\n", args->program_name);
-    SDL_SetError("[PS3->SPU] Failed running SPE context: %s", args->program_name);
-    exit(1);
-  }
-
-  pthread_exit(NULL);
-}
-
 /* vi: set ts=4 sw=4 expandtab: */
--- a/src/video/ps3/SDL_ps3video.h	Sat Jun 06 06:40:23 2009 +0000
+++ b/src/video/ps3/SDL_ps3video.h	Wed Jun 10 09:15:33 2009 +0000
@@ -21,12 +21,11 @@
 */
 #include "SDL_config.h"
 
-#include <libspe2.h>
-
 #ifndef _SDL_ps3video_h
 #define _SDL_ps3video_h
 
 #include "../SDL_sysvideo.h"
+#include "SDL_ps3spe_c.h"
 
 /* Debugging
  * 0: No debug messages
@@ -53,19 +52,6 @@
 /* Default framebuffer device on PS3 */
 #define PS3DEV "/dev/fb0"
 
-/* SPU thread data */
-typedef struct spu_data {
-    spe_context_ptr_t ctx;
-    spe_program_handle_t program;
-    pthread_t thread;
-    char * program_name;
-    unsigned int booted;
-    unsigned int keepalive;
-    unsigned int entry;
-    int error_code;
-    void * argp;
-} spu_data_t;
-
 /* Private display data */
 typedef struct SDL_VideoData
 {
--- a/src/video/ps3/spulibs/Makefile	Sat Jun 06 06:40:23 2009 +0000
+++ b/src/video/ps3/spulibs/Makefile	Wed Jun 10 09:15:33 2009 +0000
@@ -15,8 +15,8 @@
 PREFIX=/usr/lib
 
 
-all: libfb_writer_spu.a libfb_writer_spu.so
-#				libyuv2rgb_spu.so libyuv2rgb_spu.a \
+all: libfb_writer_spu.a libfb_writer_spu.so \
+				libyuv2rgb_spu.so libyuv2rgb_spu.a
 #				libbilin_scaler_spu.so libbilin_scaler_spu.a
 
 
@@ -55,24 +55,24 @@
 libbilin_scaler_spu.so: bilin_scaler_spu-embed.o
 	$(PPU_LD) -o libbilin_scaler_spu.so -shared -soname=libbilin_scaler_spu.so bilin_scaler_spu-embed.o
 
-install: libfb_writer_spu.a libfb_writer_spu.so
-#				libyuv2rgb_spu.so libyuv2rgb_spu.a \
+install: libfb_writer_spu.a libfb_writer_spu.so \
+				libyuv2rgb_spu.so libyuv2rgb_spu.a
 #				libbilin_scaler_spu.so libbilin_scaler_spu.a
 	$(INSTALL) -c -m 0755 libfb_writer_spu.so $(PREFIX)/.
 	$(INSTALL) -c -m 0655 libfb_writer_spu.a $(PREFIX)/.
-#	$(INSTALL) -c -m 0755 libyuv2rgb_spu.so $(PREFIX)/.
-#	$(INSTALL) -c -m 0655 libyuv2rgb_spu.a $(PREFIX)/.
+	$(INSTALL) -c -m 0755 libyuv2rgb_spu.so $(PREFIX)/.
+	$(INSTALL) -c -m 0655 libyuv2rgb_spu.a $(PREFIX)/.
 #	$(INSTALL) -c -m 0755 libbilin_scaler_spu.so $(PREFIX)/.
 #	$(INSTALL) -c -m 0655 libbilin_scaler_spu.a $(PREFIX)/.
 
 
-uninstall: $(PREFIX)/libfb_writer_spu.so $(PREFIX)/libfb_writer_spu.a
-#		$(PREFIX)/libyuv2rgb_spu.so $(PREFIX)/libyuv2rgb_spu.a \
+uninstall: $(PREFIX)/libfb_writer_spu.so $(PREFIX)/libfb_writer_spu.a \
+		$(PREFIX)/libyuv2rgb_spu.so $(PREFIX)/libyuv2rgb_spu.a
 #		$(PREFIX)/libbilin_scaler_spu.so $(PREFIX)/libbilin_scaler_spu.a
 	rm -f $(PREFIX)/libfb_writer_spu.a
 	rm -f $(PREFIX)/libfb_writer_spu.so
-#	rm -f $(PREFIX)/libyuv2rgb_spu.so
-#	rm -f $(PREFIX)/libyuv2rgb_spu.a
+	rm -f $(PREFIX)/libyuv2rgb_spu.so
+	rm -f $(PREFIX)/libyuv2rgb_spu.a
 #	rm -f $(PREFIX)/libbilin_scaler_spu.so
 #	rm -f $(PREFIX)/libbilin_scaler_spu.a
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/video/ps3/spulibs/yuv2rgb_converter.c	Wed Jun 10 09:15:33 2009 +0000
@@ -0,0 +1,629 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+
+#include "spu_common.h"
+
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+
+// Debugging
+//#define DEBUG
+
+#ifdef DEBUG
+#define deprintf(fmt, args... ) \
+	fprintf( stdout, fmt, ##args ); \
+	fflush( stdout );
+#else
+#define deprintf( fmt, args... )
+#endif
+
+struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128))); /* conversion job, filled by a DMA GET in main() */
+
+/* Input buffers in local store, two per plane (index 0 and 1).
+ * Each y buffer holds 4 lines and each v/u buffer 2 lines, so a maximum
+ * of 8 lines Y, 4 lines V and 4 lines U are stored at once.
+ * There might be the need to retrieve misaligned data; the extra 128
+ * bytes per line leave room to handle this for the incoming planes.
+ */
+unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128)));
+unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
+unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
+
+/* Output buffer in local store: a maximum of 4 lines BGRA are stored,
+ * 4 bytes per pixel. There is only one such buffer, it is reused for
+ * every group of four lines.
+ */
+unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128)));
+
+/* Clamp bounds used by float_to_char() and vfloat_to_vuint() */
+static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f }; /* upper bound */
+static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f }; /* lower bound */
+
+void yuv_to_rgb_w16(); /* whole-picture driver built on yuv_to_rgb_w16_line() */
+void yuv_to_rgb_w32(); /* whole-picture driver built on yuv_to_rgb_w32_line() */
+
+void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr, unsigned int width); /* scalar: two lines per call */
+void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width); /* vectorized: two lines per call */
+
+
+/*
+ * main()
+ *
+ * Service loop of the yuv2rgb SPU program. Announces readiness with
+ * SPU_READY and then handles inbound mailbox messages:
+ *   SPU_EXIT  - leave the program
+ *   SPU_START - read a second mailbox word holding the 32 bit effective
+ *               address of a struct yuv2rgb_parms_t, fetch it by DMA,
+ *               convert the picture and answer with SPU_FIN
+ *   other     - ignored, wait for the next message
+ *
+ * @param spe_id only used for the debug output
+ * @param argp unused
+ * @returns always 0
+ */
+int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused)))
+{
+	deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id);
+	uint32_t ea_mfc, mbox;
+	// send ready message
+	spu_write_out_mbox(SPU_READY);
+
+	while (1) {
+		/* Check mailbox (blocks until a message arrives) */
+		mbox = spu_read_in_mbox();
+		deprintf("[SPU] Message is %u\n", mbox);
+		switch (mbox) {
+			case SPU_EXIT:
+				deprintf("[SPU] yuv2rgb_spu goes down...\n");
+				return 0;
+			case SPU_START:
+				break;
+			default:
+				deprintf("[SPU] Cannot handle message\n");
+				continue;
+		}
+
+		/* Tag Manager setup: one tag for the parameter transfer */
+		unsigned int tag_id;
+		tag_id = mfc_multi_tag_reserve(1);
+		if (tag_id == MFC_TAG_INVALID) {
+			deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n");
+			/* NOTE(review): no SPU_FIN is sent on this path - confirm that
+			 * the PPU side copes with the SPU program ending here */
+			return 0;
+		}
+
+		/* DMA transfer for the input parameters; the address arrives as
+		 * the next mailbox word */
+		ea_mfc = spu_read_in_mbox();
+		deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc);
+		spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD);
+		DMA_WAIT_TAG(tag_id);
+
+		/* There are alignment issues that involve handling of special cases
+		 * a width of 32 results in a width of 16 in the chrominance
+		 * --> choose the proper handling to optimize the performance:
+		 * widths that are a multiple of 32 take the vectorized path,
+		 * everything else the scalar one
+		 */
+		deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height);
+		if (parms_converter.src_pixel_width & 0x1f) {
+			deprintf("[SPU] Using yuv_to_rgb_w16\n");
+			yuv_to_rgb_w16();
+		} else {
+			deprintf("[SPU] Using yuv_to_rgb_w32\n");
+			yuv_to_rgb_w32();
+		}
+
+		mfc_multi_tag_release(tag_id, 1);
+		deprintf("[SPU] yuv2rgb_spu... done!\n");
+		/* Send FIN message */
+		spu_write_out_mbox(SPU_FIN);
+	}
+
+	return 0;
+}
+
+
+/*
+ * float_to_char()
+ *
+ * Saturating conversion of one float to an unsigned char: the value is
+ * limited to the range given by vec_0_1 (lower bound) and vec_255 (upper
+ * bound) before it is converted.
+ *
+ * @param s float for conversion
+ * @returns converted character
+ */
+static inline unsigned char float_to_char(float s) {
+	vector float value = spu_splats(s);
+
+	// Raise everything below the lower bound to the lower bound
+	const vector unsigned int below_min = spu_cmpgt(vec_0_1, value);
+	value = spu_sel(value, vec_0_1, below_min);
+
+	// Cut everything above the upper bound down to the upper bound
+	const vector unsigned int above_max = spu_cmpgt(value, vec_255);
+	value = spu_sel(value, vec_255, above_max);
+
+	// All elements are equal, take the first one
+	return (unsigned char) spu_extract(value, 0);
+}
+
+
+/*
+ * vfloat_to_vuint()
+ *
+ * Saturating conversion of a float vector to an unsigned int vector:
+ * every element is limited to the range given by vec_0_1 (lower bound)
+ * and vec_255 (upper bound) before it is converted.
+ *
+ * @param vec_s float vector for conversion
+ * @returns converted unsigned int vector
+ */
+static inline vector unsigned int vfloat_to_vuint(vector float vec_s) {
+	// Raise elements below the lower bound to the lower bound
+	const vector unsigned int below_min = spu_cmpgt(vec_0_1, vec_s);
+	vector float clamped = spu_sel(vec_s, vec_0_1, below_min);
+
+	// Cut elements above the upper bound down to the upper bound
+	const vector unsigned int above_max = spu_cmpgt(clamped, vec_255);
+	clamped = spu_sel(clamped, vec_255, above_max);
+
+	return spu_convtu(clamped, 0);
+}
+
+
+/*
+ * yuv_to_rgb_w16()
+ *
+ * Converts the picture described by parms_converter from planar YUV to
+ * BGRA using the scalar line converter yuv_to_rgb_w16_line(). main()
+ * selects this variant when the width is not a multiple of 32.
+ *
+ * The picture is processed in groups of four lines: 4 lines Y, 2 lines V
+ * and 2 lines U are fetched by DMA into one half of y_plane/v_plane/
+ * u_plane while the other half is converted into bgra, which is then
+ * written to parms_converter.dstBuffer with two DMA PUTs.
+ *
+ * Only complete groups of four lines are converted; a picture with less
+ * than four lines is left untouched.
+ */
+void yuv_to_rgb_w16() {
+	// Pixel dimensions of the picture
+	uint32_t width, height;
+
+	// Extract parameters
+	width = parms_converter.src_pixel_width;
+	height = parms_converter.src_pixel_height;
+
+	// Number of complete 4-line groups. Without this check the unsigned
+	// loop bound "groups - 1" below would wrap around for height < 4.
+	const unsigned int num_groups = height >> 2;
+	if (num_groups == 0) {
+		return;
+	}
+
+	// Plane data management
+	// Y
+	unsigned char* ram_addr_y = parms_converter.y_plane;
+	// V
+	unsigned char* ram_addr_v = parms_converter.v_plane;
+	// U
+	unsigned char* ram_addr_u = parms_converter.u_plane;
+
+	// BGRA
+	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
+
+	// Strides
+	unsigned int stride_y = width;
+	unsigned int stride_vu = width>>1;
+
+	// Buffer management
+	unsigned int buf_idx = 0;
+	unsigned int size_4lines_y = stride_y<<2;
+	unsigned int size_2lines_y = stride_y<<1;
+	unsigned int size_2lines_vu = stride_vu<<1;
+
+	// 2*width*4byte_per_pixel
+	unsigned int size_2lines_bgra = width<<3;
+
+
+	// start double-buffered processing
+	// 4 lines y
+	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+	// 2 lines v
+	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+	// 2 lines u
+	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+	// Wait for these transfers to be completed
+	DMA_WAIT_TAG((RETR_BUF + buf_idx));
+
+	unsigned int i;
+	for(i=0; i<num_groups-1; i++) {
+		// Buffer half that receives the following group of lines
+		const unsigned int next_idx = buf_idx^1;
+
+		// 4 lines y
+		spu_mfcdma32(y_plane[next_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+next_idx, MFC_GET_CMD);
+
+		// 2 lines v
+		spu_mfcdma32(v_plane[next_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+next_idx, MFC_GET_CMD);
+
+		// 2 lines u
+		spu_mfcdma32(u_plane[next_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+next_idx, MFC_GET_CMD);
+
+		DMA_WAIT_TAG((RETR_BUF + next_idx));
+
+		// Wait for previous storing transfer to be completed. This has to
+		// happen before the conversion, because the conversion overwrites
+		// the bgra buffer the pending PUTs still read from.
+		DMA_WAIT_TAG(STR_BUF);
+
+		// Convert YUV to BGRA, store it back (first two lines)
+		yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+		// Next two lines
+		yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
+				v_plane[buf_idx] + stride_vu,
+				u_plane[buf_idx] + stride_vu,
+				bgra + size_2lines_bgra,
+				width);
+
+		// Store converted lines in two steps->max transfer size 16384
+		spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+		ram_addr_bgra += size_2lines_bgra;
+		spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+		ram_addr_bgra += size_2lines_bgra;
+
+		// Move 4 lines
+		ram_addr_y += size_4lines_y;
+		ram_addr_v += size_2lines_vu;
+		ram_addr_u += size_2lines_vu;
+
+		buf_idx = next_idx;
+	}
+
+	// Last group: nothing left to fetch, only convert and store.
+	// Wait for previous storing transfer to be completed before bgra is
+	// overwritten again.
+	DMA_WAIT_TAG(STR_BUF);
+
+	// Convert YUV to BGRA, store it back (first two lines)
+	yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+	// Next two lines
+	yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
+			v_plane[buf_idx] + stride_vu,
+			u_plane[buf_idx] + stride_vu,
+			bgra + size_2lines_bgra,
+			width);
+
+	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+	ram_addr_bgra += size_2lines_bgra;
+	spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+
+	// Wait for the final storing transfer to be completed
+	DMA_WAIT_TAG(STR_BUF);
+
+}
+
+
+/*
+ * yuv_to_rgb_w32()
+ *
+ * Converts the picture described by parms_converter from planar YUV to
+ * BGRA using the vectorized line converter yuv_to_rgb_w32_line(). main()
+ * selects this variant when the width is a multiple of 32.
+ *
+ * The picture is processed in groups of four lines: 4 lines Y, 2 lines V
+ * and 2 lines U are fetched by DMA into one half of y_plane/v_plane/
+ * u_plane while the other half is converted into bgra, which is then
+ * written to parms_converter.dstBuffer with two DMA PUTs.
+ *
+ * Only complete groups of four lines are converted; a picture with less
+ * than four lines is left untouched.
+ */
+void yuv_to_rgb_w32() {
+	// Pixel dimensions of the picture
+	uint32_t width, height;
+
+	// Extract parameters
+	width = parms_converter.src_pixel_width;
+	height = parms_converter.src_pixel_height;
+
+	// Number of complete 4-line groups. Without this check the unsigned
+	// loop bound "groups - 1" below would wrap around for height < 4.
+	const unsigned int num_groups = height >> 2;
+	if (num_groups == 0) {
+		return;
+	}
+
+	// Plane data management
+	// Y
+	unsigned char* ram_addr_y = parms_converter.y_plane;
+	// V
+	unsigned char* ram_addr_v = parms_converter.v_plane;
+	// U
+	unsigned char* ram_addr_u = parms_converter.u_plane;
+
+	// BGRA
+	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
+
+	// Strides
+	unsigned int stride_y = width;
+	unsigned int stride_vu = width>>1;
+
+	// Buffer management
+	unsigned int buf_idx = 0;
+	unsigned int size_4lines_y = stride_y<<2;
+	unsigned int size_2lines_y = stride_y<<1;
+	unsigned int size_2lines_vu = stride_vu<<1;
+
+	// 2*width*4byte_per_pixel
+	unsigned int size_2lines_bgra = width<<3;
+
+	// start double-buffered processing
+	// 4 lines y
+	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
+	// 2 lines v
+	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+	// 2 lines u
+	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+
+	// Wait for these transfers to be completed
+	DMA_WAIT_TAG((RETR_BUF + buf_idx));
+
+	unsigned int i;
+	for(i=0; i < num_groups-1; i++) {
+		// Buffer half that receives the following group of lines
+		const unsigned int next_idx = buf_idx^1;
+
+		// 4 lines y
+		spu_mfcdma32(y_plane[next_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + next_idx, MFC_GET_CMD);
+		deprintf("4lines = %d\n", size_4lines_y);
+		// 2 lines v
+		spu_mfcdma32(v_plane[next_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + next_idx, MFC_GET_CMD);
+		deprintf("2lines = %d\n", size_2lines_vu);
+		// 2 lines u
+		spu_mfcdma32(u_plane[next_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + next_idx, MFC_GET_CMD);
+		deprintf("2lines = %d\n", size_2lines_vu);
+
+		DMA_WAIT_TAG((RETR_BUF + next_idx));
+
+		// Wait for previous storing transfer to be completed. This has to
+		// happen before the conversion, because the conversion overwrites
+		// the bgra buffer the pending PUTs still read from.
+		DMA_WAIT_TAG(STR_BUF);
+
+		// Convert YUV to BGRA, store it back (first two lines)
+		yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+		// Next two lines
+		yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
+				v_plane[buf_idx] + stride_vu,
+				u_plane[buf_idx] + stride_vu,
+				bgra + size_2lines_bgra,
+				width);
+
+		// Store converted lines in two steps->max transfer size 16384
+		spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+		ram_addr_bgra += size_2lines_bgra;
+		spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+		ram_addr_bgra += size_2lines_bgra;
+
+		// Move 4 lines
+		ram_addr_y += size_4lines_y;
+		ram_addr_v += size_2lines_vu;
+		ram_addr_u += size_2lines_vu;
+
+		buf_idx = next_idx;
+	}
+
+	// Last group: nothing left to fetch, only convert and store.
+	// Wait for previous storing transfer to be completed before bgra is
+	// overwritten again.
+	DMA_WAIT_TAG(STR_BUF);
+
+	// Convert YUV to BGRA, store it back (first two lines)
+	yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+	// Next two lines
+	yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
+			v_plane[buf_idx] + stride_vu,
+			u_plane[buf_idx] + stride_vu,
+			bgra + size_2lines_bgra,
+			width);
+
+	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+	ram_addr_bgra += size_2lines_bgra;
+	spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+
+	// Wait for the final storing transfer to be completed
+	DMA_WAIT_TAG(STR_BUF);
+}
+
+
+/* Some vectors needed by the yuv 2 rgb conversion algorithm */
+
+/* Offset removed from the chroma samples before scaling */
+const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f };
+
+/* All-zero vector, used as first spu_shuffle operand to supply zero bytes */
+const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+/* Shuffle patterns that widen four bytes of the second shuffle operand
+ * (pattern values 0x10..0x1F) into four 32 bit words; the three upper
+ * bytes of every word are taken from the first operand (pattern 0x00).
+ * first/second/third/fourth pick bytes 0-3, 4-7, 8-11 and 12-15.
+ */
+const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 };
+const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 };
+const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B };
+const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F };
+
+/* Conversion coefficients:
+ *   R = Y + 1.403 * (V - 128)
+ *   G = Y - 0.344 * (U - 128) - 0.714 * (V - 128)
+ *   B = Y + 1.773 * (U - 128)
+ */
+const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f};
+const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f};
+const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f};
+const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f};
+
+/* Alpha 255 in the top byte of every pixel word. The constant is unsigned
+ * because 255 << 24 does not fit into a signed int.
+ */
+const vector unsigned int vec_alpha =  { 255u << 24, 255u << 24, 255u << 24, 255u << 24 };
+
+/* Shuffle patterns that duplicate float elements pairwise:
+ * upper yields (f0, f0, f1, f1), lower yields (f2, f2, f3, f3).
+ */
+const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 };
+const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F };
+
+
+/*
+ * yuv_to_rgb_w16_line()
+ *
+ * Scalar conversion of two adjacent lines of YUV input to BGRA.
+ * The pixels are handled in blocks of 2x2 which share one U and one V
+ * sample:
+ *   R = Y + 1.403 * (V - 128)
+ *   G = Y - (0.344 * (U - 128) + 0.714 * (V - 128))
+ *   B = Y + 1.773 * (U - 128)
+ * Every result is saturated by float_to_char() and written as one 32 bit
+ * word: B | G << 8 | R << 16 | 255 << 24.
+ *
+ * @param y_addr address of the y plane in local store (two lines, the
+ *               second one starts at y_addr + width)
+ * @param v_addr address of the v plane in local store (one line)
+ * @param u_addr address of the u plane in local store (one line)
+ * @param bgra_addr_ address of the bgra output buffer (two lines, the
+ *                   second one starts width pixels after the first)
+ * @param width the width in pixel, consumed in steps of two
+ */
+void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
+	// each pixel is stored as an integer
+	unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
+
+	// Opaque alpha; unsigned because 255 << 24 overflows a signed int
+	const unsigned int alpha = 255u << 24;
+
+	unsigned int x;
+	for(x = 0; x < width; x+=2) {
+		// Walk through the line in steps of two, because every u and v
+		// value applies to 4 pixels (two high, two wide)
+		const unsigned char Y_1 = *(y_addr + x);
+		const unsigned char Y_2 = *(y_addr + x + 1);
+		const unsigned char Y_3 = *(y_addr + x + width);
+		const unsigned char Y_4 = *(y_addr + x + width + 1);
+		const unsigned char U = *(u_addr + (x >> 1));
+		const unsigned char V = *(v_addr + (x >> 1));
+
+		float V_minus_128 = (float)((float)V - 128.0f);
+		float U_minus_128 = (float)((float)U - 128.0f);
+
+		// Chroma contributions, identical for all four pixels of the block
+		float R_precalculate = 1.403f * V_minus_128;
+		float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128);
+		float B_precalculate = 1.773f * U_minus_128;
+
+		const unsigned char R_1 = float_to_char((Y_1 + R_precalculate));
+		const unsigned char R_2 = float_to_char((Y_2 + R_precalculate));
+		const unsigned char R_3 = float_to_char((Y_3 + R_precalculate));
+		const unsigned char R_4 = float_to_char((Y_4 + R_precalculate));
+		const unsigned char G_1 = float_to_char((Y_1 + G_precalculate));
+		const unsigned char G_2 = float_to_char((Y_2 + G_precalculate));
+		const unsigned char G_3 = float_to_char((Y_3 + G_precalculate));
+		const unsigned char G_4 = float_to_char((Y_4 + G_precalculate));
+		const unsigned char B_1 = float_to_char((Y_1 + B_precalculate));
+		const unsigned char B_2 = float_to_char((Y_2 + B_precalculate));
+		const unsigned char B_3 = float_to_char((Y_3 + B_precalculate));
+		const unsigned char B_4 = float_to_char((Y_4 + B_precalculate));
+
+		// Upper line: pixels 1 and 2, lower line: pixels 3 and 4
+		*(bgra_addr + x) = (B_1 << 0)| (G_1 << 8) | (R_1 << 16) | alpha;
+		*(bgra_addr + x + 1) = (B_2 << 0)| (G_2 << 8) | (R_2 << 16) | alpha;
+		*(bgra_addr + x + width) = (B_3 << 0)| (G_3 << 8) | (R_3 << 16) | alpha;
+		*(bgra_addr + x + width + 1) = (B_4 << 0)| (G_4 << 8) | (R_4 << 16) | alpha;
+	}
+}
+
+
+/*
+ * yuv_to_rgb_w32_line()
+ *
+ * Vectorized conversion of two adjacent lines of YUV input to BGRA,
+ * width has to be a multiple of 32.
+ * Every loop iteration handles 32 pixels of both lines: it loads 2 x 16
+ * bytes Y per line plus 16 bytes U and 16 bytes V, widens the bytes to
+ * floats, adds the chroma contributions (same coefficients as the scalar
+ * variant), saturates the results with vfloat_to_vuint() and stores 16
+ * vectors of four packed pixels each.
+ * NOTE(review): all loads and stores go through vector pointers, so the
+ * addresses are presumably expected to be 16 byte aligned - confirm.
+ *
+ * @param y_addr address of the y plane in local store
+ * @param v_addr address of the v plane in local store
+ * @param u_addr address of the u plane in local store
+ * @param bgra_addr_ address of the bgra output buffer
+ * @param width the width in pixel
+ */
+void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
+	// each pixel is stored as an integer, so bgra_addr is indexed in pixels
+	unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
+
+	unsigned int x;
+	for(x = 0; x < width; x+=32) {
+		// Every u and v value applies to 4 pixels (two high, two wide), so the 16 chroma bytes loaded below cover 32 pixels of both lines
+
+		const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x)); // upper line, pixels x .. x+15
+		const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16)); // upper line, pixels x+16 .. x+31
+		const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width)); // lower line, pixels x .. x+15
+		const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16)); // lower line, pixels x+16 .. x+31
+		const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1)));
+		const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1)));
+
+		const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128); // bytes widened to words, converted to float, minus 128
+		const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128);
+		const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128);
+		const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128);
+
+		const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128);
+		const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128);
+		const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128);
+		const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128);
+
+		vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0); // Y_1 .. Y_8: upper line, four pixels each
+		vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0);
+		vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0);
+		vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0);
+		vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0);
+		vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0);
+		vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0);
+		vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0);
+		vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0); // Y_9 .. Y_16: lower line, four pixels each
+		vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0);
+		vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0);
+		vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0);
+		vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0);
+		vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0);
+		vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0);
+		vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0);
+
+		const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1); // red contribution: 1.403 * (V - 128)
+		const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2);
+		const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3);
+		const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4);
+
+		const vector float R1_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_upper); // every value duplicated for two horizontally adjacent pixels
+		const vector float R2_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_lower);
+		const vector float R3_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_upper);
+		const vector float R4_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_lower);
+		const vector float R5_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_upper);
+		const vector float R6_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_lower);
+		const vector float R7_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_upper);
+		const vector float R8_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_lower);
+
+
+		const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff)); // green contribution: -0.344 * (U - 128) - 0.714 * (V - 128)
+		const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff));
+		const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff));
+		const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff));
+
+		const vector float G1_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_upper);
+		const vector float G2_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_lower);
+		const vector float G3_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_upper);
+		const vector float G4_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_lower);
+		const vector float G5_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_upper);
+		const vector float G6_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_lower);
+		const vector float G7_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_upper);
+		const vector float G8_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_lower);
+
+
+		const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1); // blue contribution: 1.773 * (U - 128)
+		const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2);
+		const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3);
+		const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4);
+
+		const vector float B1_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_upper);
+		const vector float B2_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_lower);
+		const vector float B3_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_upper);
+		const vector float B4_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_lower);
+		const vector float B5_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_upper);
+		const vector float B6_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_lower);
+		const vector float B7_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_upper);
+		const vector float B8_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_lower);
+
+
+		const vector unsigned int  R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate)); // upper line (Y_1 .. Y_8) uses chroma 1 .. 8
+		const vector unsigned int  R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate));
+		const vector unsigned int  R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate));
+		const vector unsigned int  R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate));
+		const vector unsigned int  R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate));
+		const vector unsigned int  R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate));
+		const vector unsigned int  R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate));
+		const vector unsigned int  R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate));
+		const vector unsigned int  R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate)); // lower line (Y_9 .. Y_16) reuses the same chroma 1 .. 8
+		const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate));
+		const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate));
+		const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate));
+		const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate));
+		const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate));
+		const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate));
+		const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate));
+
+		const vector unsigned int  G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate));
+		const vector unsigned int  G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate));
+		const vector unsigned int  G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate));
+		const vector unsigned int  G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate));
+		const vector unsigned int  G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate));
+		const vector unsigned int  G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate));
+		const vector unsigned int  G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate));
+		const vector unsigned int  G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate));
+		const vector unsigned int  G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate));
+		const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate));
+		const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate));
+		const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate));
+		const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate));
+		const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate));
+		const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate));
+		const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate));
+
+		const vector unsigned int  B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate));
+		const vector unsigned int  B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate));
+		const vector unsigned int  B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate));
+		const vector unsigned int  B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate));
+		const vector unsigned int  B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate));
+		const vector unsigned int  B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate));
+		const vector unsigned int  B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate));
+		const vector unsigned int  B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate));
+		const vector unsigned int  B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate));
+		const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate));
+		const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate));
+		const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate));
+		const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate));
+		const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate));
+		const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate));
+		const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate));
+
+		*((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha,  B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1))); // pack four pixels: alpha | R shifted 2 bytes | G shifted 1 byte | B
+		*((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha,  B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha,  B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha,  B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha,  B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha,  B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha,  B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha,  B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha,  B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1))); // lower output line starts width pixels after the upper one
+		*((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1)));
+	}
+}
+