# HG changeset patch
# User Martin Lowinski <martin@goldtopf.org>
# Date 1250264984 0
# Node ID 4ead4cef6b7b907b4b59907313793c27f1caaf91
# Parent  abc49915ccb2f54328867bb52a77fdbb915ce529
Rewritten Makefile (fixed bug) and cleanup for yuv2rgb.

diff -r abc49915ccb2 -r 4ead4cef6b7b src/video/ps3/spulibs/Makefile
--- a/src/video/ps3/spulibs/Makefile	Tue Aug 11 14:37:19 2009 +0000
+++ b/src/video/ps3/spulibs/Makefile	Fri Aug 14 15:49:44 2009 +0000
@@ -7,67 +7,41 @@
 SPU_LIBDIR=$(srcdir)/src/video/ps3/spulibs/libs
 SPU_CFLAGS=-g -W -Wall -Winline -Wno-main -I. -I /usr/spu/include -I /opt/cell/sdk/usr/spu/include -finline-limit=10000 -Winline -ftree-vectorize -funroll-loops -fmodulo-sched -ffast-math -fPIC -O2
 
-
-ps3libs: $(SPU_LIBDIR)/libfb_writer_spu.a $(SPU_LIBDIR)/libfb_writer_spu.so \
-				$(SPU_LIBDIR)/libyuv2rgb_spu.so $(SPU_LIBDIR)/libyuv2rgb_spu.a \
-				$(SPU_LIBDIR)/libbilin_scaler_spu.so $(SPU_LIBDIR)/libbilin_scaler_spu.a
-
-$(SPU_LIBDIR):
-	$(SHELL) $(auxdir)/mkinstalldirs $(SPU_LIBDIR)
+DEPS = $(SPU_SRCDIR)/spu_common.h
+LIBS= fb_writer yuv2rgb bilin_scaler
 
-# fb_writer (basically copying from a to b)
-$(SPU_LIBDIR)/fb_writer_spu-embed.o: $(SPU_LIBDIR) $(SPU_SRCDIR)/fb_writer.c $(SPU_SRCDIR)/spu_common.h
-	$(SPU_GCC) $(SPU_CFLAGS) -o $(SPU_LIBDIR)/fb_writer_spu $(SPU_SRCDIR)/fb_writer.c -lm
-	$(EMBEDSPU) -m32 fb_writer_spu $(SPU_LIBDIR)/fb_writer_spu $(SPU_LIBDIR)/fb_writer_spu-embed.o
-
-$(SPU_LIBDIR)/libfb_writer_spu.a: $(SPU_LIBDIR)/fb_writer_spu-embed.o
-	$(AR) -qcs $(SPU_LIBDIR)/libfb_writer_spu.a $(SPU_LIBDIR)/fb_writer_spu-embed.o
-
-$(SPU_LIBDIR)/libfb_writer_spu.so: $(SPU_LIBDIR)/fb_writer_spu-embed.o
-	$(PPU_LD) -o $(SPU_LIBDIR)/libfb_writer_spu.so -shared -soname=libfb_writer_spu.so $(SPU_LIBDIR)/fb_writer_spu-embed.o
+OBJLIBS = $(foreach lib,$(LIBS),lib$(lib)_spu.a)
+SHALIBS = $(foreach lib,$(LIBS),lib$(lib)_spu.so)
 
 
-# yuv2rgb_converter (converting YV12/IYUV to RGB)
-$(SPU_LIBDIR)/yuv2rgb_spu-embed.o: $(SPU_LIBDIR) $(SPU_SRCDIR)/yuv2rgb_converter.c $(SPU_SRCDIR)/spu_common.h
-	$(SPU_GCC) $(SPU_CFLAGS) -o $(SPU_LIBDIR)/yuv2rgb_spu $(SPU_SRCDIR)/yuv2rgb_converter.c -lm
-	$(EMBEDSPU) -m32 yuv2rgb_spu $(SPU_LIBDIR)/yuv2rgb_spu $(SPU_LIBDIR)/yuv2rgb_spu-embed.o
-
-$(SPU_LIBDIR)/libyuv2rgb_spu.a: $(SPU_LIBDIR)/yuv2rgb_spu-embed.o
-	$(AR) -qcs $(SPU_LIBDIR)/libyuv2rgb_spu.a $(SPU_LIBDIR)/yuv2rgb_spu-embed.o
-
-$(SPU_LIBDIR)/libyuv2rgb_spu.so: $(SPU_LIBDIR)/yuv2rgb_spu-embed.o
-	$(PPU_LD) -o $(SPU_LIBDIR)/libyuv2rgb_spu.so -shared -soname=libyuv2rgb_spu.so $(SPU_LIBDIR)/yuv2rgb_spu-embed.o
+ps3libs: $(foreach lib,$(OBJLIBS),$(SPU_LIBDIR)/$(lib)) $(foreach lib,$(SHALIBS),$(SPU_LIBDIR)/$(lib))
 
 
-# bilin_scaler (scaling bilinear YV12/IYUV pictures with resolutions /16)
-$(SPU_LIBDIR)/bilin_scaler_spu-embed.o: $(SPU_LIBDIR) $(SPU_SRCDIR)/bilin_scaler.c $(SPU_SRCDIR)/spu_common.h
-	$(SPU_GCC) $(SPU_CFLAGS) -o $(SPU_LIBDIR)/bilin_scaler_spu $(SPU_SRCDIR)/bilin_scaler.c -lm
-	$(EMBEDSPU) -m32 bilin_scaler_spu $(SPU_LIBDIR)/bilin_scaler_spu $(SPU_LIBDIR)/bilin_scaler_spu-embed.o
+$(SPU_LIBDIR)/lib%_spu.a: $(SPU_LIBDIR)/%-embed.o
+	$(AR) -qcs $@ $<
+
+$(SPU_LIBDIR)/lib%_spu.so: $(SPU_LIBDIR)/%-embed.o
+	$(PPU_LD) -o $@ -shared -soname=$(notdir $@) $<
 
-$(SPU_LIBDIR)/libbilin_scaler_spu.a: $(SPU_LIBDIR)/bilin_scaler_spu-embed.o
-	$(AR) -qcs $(SPU_LIBDIR)/libbilin_scaler_spu.a $(SPU_LIBDIR)/bilin_scaler_spu-embed.o
+$(SPU_LIBDIR)/%-embed.o: $(SPU_LIBDIR)/%.o
+	$(EMBEDSPU) -m32 $(subst -embed.o,,$(notdir $@))_spu $< $@
 
-$(SPU_LIBDIR)/libbilin_scaler_spu.so: $(SPU_LIBDIR)/bilin_scaler_spu-embed.o
-	$(PPU_LD) -o $(SPU_LIBDIR)/libbilin_scaler_spu.so -shared -soname=libbilin_scaler_spu.so $(SPU_LIBDIR)/bilin_scaler_spu-embed.o
+$(SPU_LIBDIR)/%.o: $(SPU_SRCDIR)/%.c $(DEPS)
+	$(SPU_GCC) $(SPU_CFLAGS) -o $@ $< -lm
 
 
-ps3libs-install: $(SPU_LIBDIR)/libfb_writer_spu.a $(SPU_LIBDIR)/libfb_writer_spu.so \
-				$(SPU_LIBDIR)/libyuv2rgb_spu.so $(SPU_LIBDIR)/libyuv2rgb_spu.a \
-				$(SPU_LIBDIR)/libbilin_scaler_spu.so $(SPU_LIBDIR)/libbilin_scaler_spu.a
-	$(INSTALL) -c -m 0755 $(SPU_LIBDIR)/libfb_writer_spu.so $(DESTDIR)$(libdir)/.
-	$(INSTALL) -c -m 0655 $(SPU_LIBDIR)/libfb_writer_spu.a $(DESTDIR)$(libdir)/.
-	$(INSTALL) -c -m 0755 $(SPU_LIBDIR)/libyuv2rgb_spu.so $(DESTDIR)$(libdir)/.
-	$(INSTALL) -c -m 0655 $(SPU_LIBDIR)/libyuv2rgb_spu.a $(DESTDIR)$(libdir)/.
-	$(INSTALL) -c -m 0755 $(SPU_LIBDIR)/libbilin_scaler_spu.so $(DESTDIR)$(libdir)/.
-	$(INSTALL) -c -m 0655 $(SPU_LIBDIR)/libbilin_scaler_spu.a $(DESTDIR)$(libdir)/.
+ps3libs-install: $(foreach obj,$(OBJLIBS),$(SPU_LIBDIR)/$(obj)) $(foreach obj,$(SHALIBS),$(SPU_LIBDIR)/$(obj))
+	for file in $(OBJLIBS); do \
+		$(INSTALL) -c -m 0655 $(SPU_LIBDIR)/$$file $(DESTDIR)$(libdir)/$$file; \
+	done
+	for file in $(SHALIBS); do \
+		$(INSTALL) -c -m 0755 $(SPU_LIBDIR)/$$file $(DESTDIR)$(libdir)/$$file; \
+	done
 
 ps3libs-uninstall:
-	rm -f $(DESTDIR)$(libdir)/libfb_writer_spu.a
-	rm -f $(DESTDIR)$(libdir)/libfb_writer_spu.so
-	rm -f $(DESTDIR)$(libdir)/libyuv2rgb_spu.so
-	rm -f $(DESTDIR)$(libdir)/libyuv2rgb_spu.a
-	rm -f $(DESTDIR)$(libdir)/libbilin_scaler_spu.so
-	rm -f $(DESTDIR)$(libdir)/libbilin_scaler_spu.a
+	for file in $(OBJLIBS) $(SHALIBS); do \
+		rm -f $(DESTDIR)$(libdir)/$$file; \
+	done
 
 ps3libs-clean:
-	rm -f $(SPU_LIBDIR)/* 
+	rm -f $(SPU_LIBDIR)/*
diff -r abc49915ccb2 -r 4ead4cef6b7b src/video/ps3/spulibs/yuv2rgb.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/video/ps3/spulibs/yuv2rgb.c	Fri Aug 14 15:49:44 2009 +0000
@@ -0,0 +1,662 @@
+/*
+ * SDL - Simple DirectMedia Layer
+ * CELL BE Support for PS3 Framebuffer
+ * Copyright (C) 2008, 2009 International Business Machines Corporation
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ *
+ *  Martin Lowinski  <lowinski [at] de [dot] ibm [ibm] com>
+ *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
+ *  SPE code based on research by:
+ *  Rene Becker
+ *  Thimo Emmerich
+ */
+
+#include "spu_common.h"
+
+#include <spu_intrinsics.h>
+#include <spu_mfcio.h>
+
+// Debugging
+#define DEBUG
+
+// Test environment for /2 resolutions
+#define TESTING
+
+#ifdef DEBUG
+#define deprintf(fmt, args... ) \
+	fprintf( stdout, fmt, ##args ); \
+	fflush( stdout );
+#else
+#define deprintf( fmt, args... )
+#endif
+
+struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128)));
+
+/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored
+ * there might be the need to retrieve misaligned data, adjust
+ * incoming v and u plane to be able to handle this (add 128)
+ */
+unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128)));
+unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
+unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
+
+/* A maximum of 4 lines BGRA are stored, 4 byte per pixel */
+unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128)));
+
+/* some vectors needed by the float to int conversion */
+static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
+static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
+
+void yuv_to_rgb_w16();
+void yuv_to_rgb_w32();
+
+void yuv_to_rgb_w2_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr, unsigned int width);
+void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);
+
+
+int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused)))
+{
+	deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id);
+	uint32_t ea_mfc, mbox;
+	// send ready message
+	spu_write_out_mbox(SPU_READY);
+
+	while (1) {
+		/* Check mailbox */
+		mbox = spu_read_in_mbox();
+		deprintf("[SPU] Message is %u\n", mbox);
+		switch (mbox) {
+			case SPU_EXIT:
+				deprintf("[SPU] yuv2rgb_converter goes down...\n");
+				return 0;
+			case SPU_START:
+				break;
+			default:
+				deprintf("[SPU] Cannot handle message\n");
+				continue;
+		}
+
+		/* Tag Manager setup */
+		unsigned int tag_id;
+		tag_id = mfc_multi_tag_reserve(1);
+		if (tag_id == MFC_TAG_INVALID) {
+			deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n");
+			return 0;
+		}
+
+		/* DMA transfer for the input parameters */
+		ea_mfc = spu_read_in_mbox();
+		deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc);
+		spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD);
+		DMA_WAIT_TAG(tag_id);
+
+		/* There are alignment issues that involve handling of special cases
+		 * a width of 32 results in a width of 16 in the chrominance
+		 * --> choose the proper handling to optimize the performance
+		 */
+		deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height);
+		if (!(parms_converter.src_pixel_width & 0x1f)) {
+			deprintf("[SPU] Using yuv_to_rgb_w16\n");
+			yuv_to_rgb_w16();
+		} else {
+			deprintf("[SPU] Using yuv_to_rgb_w32\n");
+			yuv_to_rgb_w32();
+		}
+
+		mfc_multi_tag_release(tag_id, 1);
+		deprintf("[SPU] yuv2rgb_spu... done!\n");
+		/* Send FIN message */
+		spu_write_out_mbox(SPU_FIN);
+	}
+
+	return 0;
+}
+
+
+/*
+ * float_to_char()
+ *
+ * converts a float to a character using saturated
+ * arithmetic
+ *
+ * @param s float for conversion
+ * @returns converted character
+ */
+inline static unsigned char float_to_char(float s) {
+	vector float vec_s = spu_splats(s);
+	vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
+	vec_s = spu_sel(vec_s, vec_0_1, select_1);
+
+	vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
+	vec_s = spu_sel(vec_s, vec_255, select_2);
+	return (unsigned char) spu_extract(vec_s,0);
+}
+
+
+/*
+ * vfloat_to_vuint()
+ *
+ * converts a float vector to an unsinged int vector using saturated
+ * arithmetic
+ *
+ * @param vec_s float vector for conversion
+ * @returns converted unsigned int vector
+ */
+inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
+	vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
+	vec_s = spu_sel(vec_s, vec_0_1, select_1);
+
+	vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
+	vec_s = spu_sel(vec_s, vec_255, select_2);
+	return spu_convtu(vec_s,0);
+}
+
+
+void yuv_to_rgb_w16() {
+	// Pixel dimensions of the picture
+	uint32_t width, height;
+
+	// Extract parameters
+	width = parms_converter.src_pixel_width;
+	height = parms_converter.src_pixel_height;
+
+	// Plane data management
+	// Y
+	unsigned char* ram_addr_y = parms_converter.y_plane;
+	// V
+	unsigned char* ram_addr_v = parms_converter.v_plane;
+	// U
+	unsigned char* ram_addr_u = parms_converter.u_plane;
+
+	// BGRA
+	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
+
+	// Strides
+	unsigned int stride_y = width;
+	unsigned int stride_vu = width>>1;
+
+	// Buffer management
+	unsigned int buf_idx = 0;
+	unsigned int size_4lines_y = stride_y<<2;
+	unsigned int size_2lines_y = stride_y<<1;
+	unsigned int size_2lines_vu = stride_vu<<1;
+
+	// 2*width*4byte_per_pixel
+	unsigned int size_2lines_bgra = width<<3;
+
+
+	// start double-buffered processing
+	// 4 lines y
+	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+	// 2 lines v
+	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+	// 2 lines u
+	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+	// Wait for these transfers to be completed
+	DMA_WAIT_TAG((RETR_BUF + buf_idx));
+
+	unsigned int i;
+	for(i=0; i<(height>>2)-1; i++) {
+
+		buf_idx^=1;
+
+		// 4 lines y
+		spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+		// 2 lines v
+		spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+		// 2 lines u
+		spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
+
+		DMA_WAIT_TAG((RETR_BUF + buf_idx));
+
+		buf_idx^=1;
+
+
+		// Convert YUV to BGRA, store it back (first two lines)
+#ifndef TESTING
+		yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+		// Next two lines
+		yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
+				v_plane[buf_idx] + stride_vu,
+				u_plane[buf_idx] + stride_vu,
+				bgra + size_2lines_bgra,
+				width);
+#else
+		yuv_to_rgb_w2_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+		// Next two lines
+		yuv_to_rgb_w2_line(y_plane[buf_idx] + size_2lines_y,
+				v_plane[buf_idx] + stride_vu,
+				u_plane[buf_idx] + stride_vu,
+				bgra + size_2lines_bgra,
+				width);
+#endif
+
+		// Wait for previous storing transfer to be completed
+		DMA_WAIT_TAG(STR_BUF);
+
+		// Store converted lines in two steps->max transfer size 16384
+		spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+		ram_addr_bgra += size_2lines_bgra;
+		spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+		ram_addr_bgra += size_2lines_bgra;
+
+		// Move 4 lines
+		ram_addr_y += size_4lines_y;
+		ram_addr_v += size_2lines_vu;
+		ram_addr_u += size_2lines_vu;
+
+		buf_idx^=1;
+	}
+
+#ifndef TESTING
+	// Convert YUV to BGRA, store it back (first two lines)
+	yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+	// Next two lines
+	yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
+			v_plane[buf_idx] + stride_vu,
+			u_plane[buf_idx] + stride_vu,
+			bgra + size_2lines_bgra,
+			width);
+#else
+	// Convert YUV to BGRA, store it back (first two lines)
+	yuv_to_rgb_w2_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+	// Next two lines
+	yuv_to_rgb_w2_line(y_plane[buf_idx] + size_2lines_y,
+			v_plane[buf_idx] + stride_vu,
+			u_plane[buf_idx] + stride_vu,
+			bgra + size_2lines_bgra,
+			width);
+#endif
+
+	// Wait for previous storing transfer to be completed
+	DMA_WAIT_TAG(STR_BUF);
+	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+	ram_addr_bgra += size_2lines_bgra;
+	spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+
+	// wait for previous storing transfer to be completed
+	DMA_WAIT_TAG(STR_BUF);
+
+}
+
+
+void yuv_to_rgb_w32() {
+	// Pixel dimensions of the picture
+	uint32_t width, height;
+
+	// Extract parameters
+	width = parms_converter.src_pixel_width;
+	height = parms_converter.src_pixel_height;
+
+	// Plane data management
+	// Y
+	unsigned char* ram_addr_y = parms_converter.y_plane;
+	// V
+	unsigned char* ram_addr_v = parms_converter.v_plane;
+	// U
+	unsigned char* ram_addr_u = parms_converter.u_plane;
+
+	// BGRA
+	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
+
+	// Strides
+	unsigned int stride_y = width;
+	unsigned int stride_vu = width>>1;
+
+	// Buffer management
+	unsigned int buf_idx = 0;
+	unsigned int size_4lines_y = stride_y<<2;
+	unsigned int size_2lines_y = stride_y<<1;
+	unsigned int size_2lines_vu = stride_vu<<1;
+
+	// 2*width*4byte_per_pixel
+	unsigned int size_2lines_bgra = width<<3;
+
+	// start double-buffered processing
+	// 4 lines y
+	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
+	// 2 lines v
+	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+	// 2 lines u
+	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+
+	// Wait for these transfers to be completed
+	DMA_WAIT_TAG((RETR_BUF + buf_idx));
+
+	unsigned int i;
+	for(i=0; i < (height>>2)-1; i++) {
+		buf_idx^=1;
+		// 4 lines y
+		spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
+		deprintf("4lines = %d\n", size_4lines_y);
+		// 2 lines v
+		spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+		deprintf("2lines = %d\n", size_2lines_vu);
+		// 2 lines u
+		spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
+		deprintf("2lines = %d\n", size_2lines_vu);
+
+		DMA_WAIT_TAG((RETR_BUF + buf_idx));
+
+		buf_idx^=1;
+
+		// Convert YUV to BGRA, store it back (first two lines)
+		yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+		// Next two lines
+		yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
+				v_plane[buf_idx] + stride_vu,
+				u_plane[buf_idx] + stride_vu,
+				bgra + size_2lines_bgra,
+				width);
+
+		// Wait for previous storing transfer to be completed
+		DMA_WAIT_TAG(STR_BUF);
+
+		// Store converted lines in two steps->max transfer size 16384
+		spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+		ram_addr_bgra += size_2lines_bgra;
+		spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+		ram_addr_bgra += size_2lines_bgra;
+
+		// Move 4 lines
+		ram_addr_y += size_4lines_y;
+		ram_addr_v += size_2lines_vu;
+		ram_addr_u += size_2lines_vu;
+
+		buf_idx^=1;
+	}
+
+	// Convert YUV to BGRA, store it back (first two lines)
+	yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
+
+	// Next two lines
+	yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
+			v_plane[buf_idx] + stride_vu,
+			u_plane[buf_idx] + stride_vu,
+			bgra + size_2lines_bgra,
+			width);
+
+	// Wait for previous storing transfer to be completed
+	DMA_WAIT_TAG(STR_BUF);
+	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+	ram_addr_bgra += size_2lines_bgra;
+	spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
+
+	// Wait for previous storing transfer to be completed
+	DMA_WAIT_TAG(STR_BUF);
+}
+
+
+/* Some vectors needed by the yuv 2 rgb conversion algorithm */
+const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f };
+const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 };
+const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 };
+const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B };
+const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F };
+
+const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f};
+const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f};
+const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f};
+const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f};
+
+const vector unsigned int vec_alpha =  { 255 << 24, 255 << 24, 255 << 24, 255 << 24 };
+
+const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 };
+const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F };
+
+
+#ifdef TESTING
+/*
+ * yuv_to_rgb_w2()
+ *
+ * - converts x * 4 pixels from YUV to RGB
+ * - two lines of YUV are taken as input.
+ * - width has to be a multiple of 2 (= 4 pixel)
+ *
+ * @param y_addr address of the y plane (local store)
+ * @param v_addr address of the v plane (local store)
+ * @param u_addr address of the u plane (local store)
+ * @param bgra_addr_char address of the bgra output buffer (local store)
+ * @param width the width of a line in pixel
+ */
+void yuv_to_rgb_w2_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_char, unsigned int width) {
+	// each pixel is stored as an integer
+	unsigned int* bgra_addr = (unsigned int*) bgra_addr_char;
+
+	unsigned int x;
+	// Go through each line in steps of 2, because every U and V value is connected to 4 pixels Y (YUV 4:2:0)
+	for(x = 0; x < width; x+=2) {
+		// Get the 4 Y, 1 U and 1 V values
+		const unsigned char Y_1 = *(y_addr + x);
+		const unsigned char Y_2 = *(y_addr + x + 1);
+		const unsigned char Y_3 = *(y_addr + x + width);
+		const unsigned char Y_4 = *(y_addr + x + width + 1);
+		const unsigned char U = *(u_addr + (x >> 1));
+		const unsigned char V = *(v_addr + (x >> 1));
+
+		// Start converting
+		float V_minus_128 = (float)((float)V - 128.0f);
+		float U_minus_128 = (float)((float)U - 128.0f);
+
+		float R_precalculate = 1.403f * V_minus_128;
+		float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128);
+		float B_precalculate = 1.773f * U_minus_128;
+
+		// Cast the results
+		const unsigned char R_1 = float_to_char((Y_1 + R_precalculate));
+		const unsigned char R_2 = float_to_char((Y_2 + R_precalculate));
+		const unsigned char R_3 = float_to_char((Y_3 + R_precalculate));
+		const unsigned char R_4 = float_to_char((Y_4 + R_precalculate));
+		const unsigned char G_1 = float_to_char((Y_1 + G_precalculate));
+		const unsigned char G_2 = float_to_char((Y_2 + G_precalculate));
+		const unsigned char G_3 = float_to_char((Y_3 + G_precalculate));
+		const unsigned char G_4 = float_to_char((Y_4 + G_precalculate));
+		const unsigned char B_1 = float_to_char((Y_1 + B_precalculate));
+		const unsigned char B_2 = float_to_char((Y_2 + B_precalculate));
+		const unsigned char B_3 = float_to_char((Y_3 + B_precalculate));
+		const unsigned char B_4 = float_to_char((Y_4 + B_precalculate));
+
+		// Write back
+		*(bgra_addr + x) = (B_1 << 0)| (G_1 << 8) | (R_1 << 16) | (255 << 24);
+		*(bgra_addr + x + 1) = (B_2 << 0)| (G_2 << 8) | (R_2 << 16) | (255 << 24);
+		*(bgra_addr + x + width) = (B_3 << 0)| (G_3 << 8) | (R_3 << 16) | (255 << 24);
+		*(bgra_addr + x + width + 1) = (B_4 << 0)| (G_4 << 8) | (R_4 << 16) | (255 << 24);
+	}
+}
+#endif
+
+
+/*
+ * yuv_to_rgb_w32()
+ *
+ * processes to line of yuv-input, width has to be a multiple of 32
+ * two lines of yuv are taken as input
+ *
+ * @param y_addr address of the y plane in local store
+ * @param v_addr address of the v plane in local store
+ * @param u_addr address of the u plane in local store
+ * @param bgra_addr_ address of the bgra output buffer
+ * @param width the width in pixel
+ */
+void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
+	// each pixel is stored as an integer
+	unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
+
+	unsigned int x;
+	for(x = 0; x < width; x+=32) {
+		// Gehe zweischrittig durch die zeile, da jeder u und v wert fuer 4 pixel(zwei hoch, zwei breit) gilt
+
+		const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x));
+		const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16));
+		const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width));
+		const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16));
+		const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1)));
+		const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1)));
+
+		const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128);
+		const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128);
+		const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128);
+		const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128);
+
+		const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128);
+		const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128);
+		const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128);
+		const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128);
+
+		vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0);
+		vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0);
+		vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0);
+		vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0);
+		vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0);
+		vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0);
+		vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0);
+		vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0);
+		vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0);
+		vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0);
+		vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0);
+		vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0);
+		vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0);
+		vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0);
+		vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0);
+		vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0);
+
+		const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1);
+		const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2);
+		const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3);
+		const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4);
+
+		const vector float R1_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_upper);
+		const vector float R2_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_lower);
+		const vector float R3_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_upper);
+		const vector float R4_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_lower);
+		const vector float R5_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_upper);
+		const vector float R6_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_lower);
+		const vector float R7_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_upper);
+		const vector float R8_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_lower);
+
+
+		const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff));
+		const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff));
+		const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff));
+		const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff));
+
+		const vector float G1_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_upper);
+		const vector float G2_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_lower);
+		const vector float G3_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_upper);
+		const vector float G4_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_lower);
+		const vector float G5_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_upper);
+		const vector float G6_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_lower);
+		const vector float G7_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_upper);
+		const vector float G8_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_lower);
+
+
+		const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1);
+		const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2);
+		const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3);
+		const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4);
+
+		const vector float B1_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_upper);
+		const vector float B2_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_lower);
+		const vector float B3_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_upper);
+		const vector float B4_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_lower);
+		const vector float B5_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_upper);
+		const vector float B6_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_lower);
+		const vector float B7_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_upper);
+		const vector float B8_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_lower);
+
+
+		const vector unsigned int  R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate));
+		const vector unsigned int  R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate));
+		const vector unsigned int  R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate));
+		const vector unsigned int  R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate));
+		const vector unsigned int  R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate));
+		const vector unsigned int  R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate));
+		const vector unsigned int  R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate));
+		const vector unsigned int  R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate));
+		const vector unsigned int  R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate));
+		const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate));
+		const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate));
+		const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate));
+		const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate));
+		const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate));
+		const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate));
+		const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate));
+
+		const vector unsigned int  G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate));
+		const vector unsigned int  G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate));
+		const vector unsigned int  G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate));
+		const vector unsigned int  G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate));
+		const vector unsigned int  G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate));
+		const vector unsigned int  G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate));
+		const vector unsigned int  G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate));
+		const vector unsigned int  G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate));
+		const vector unsigned int  G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate));
+		const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate));
+		const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate));
+		const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate));
+		const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate));
+		const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate));
+		const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate));
+		const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate));
+
+		const vector unsigned int  B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate));
+		const vector unsigned int  B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate));
+		const vector unsigned int  B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate));
+		const vector unsigned int  B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate));
+		const vector unsigned int  B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate));
+		const vector unsigned int  B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate));
+		const vector unsigned int  B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate));
+		const vector unsigned int  B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate));
+		const vector unsigned int  B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate));
+		const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate));
+		const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate));
+		const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate));
+		const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate));
+		const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate));
+		const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate));
+		const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate));
+
+		*((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha,  B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha,  B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha,  B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha,  B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha,  B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha,  B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha,  B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1)));
+		*((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha,  B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha,  B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1)));
+		*((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1)));
+	}
+}
+
diff -r abc49915ccb2 -r 4ead4cef6b7b src/video/ps3/spulibs/yuv2rgb_converter.c
--- a/src/video/ps3/spulibs/yuv2rgb_converter.c	Tue Aug 11 14:37:19 2009 +0000
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,629 +0,0 @@
-/*
- * SDL - Simple DirectMedia Layer
- * CELL BE Support for PS3 Framebuffer
- * Copyright (C) 2008, 2009 International Business Machines Corporation
- *
- * This library is free software; you can redistribute it and/or modify it
- * under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
- * USA
- *
- *  Martin Lowinski  <lowinski [at] de [dot] ibm [ibm] com>
- *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
- *  SPE code based on research by:
- *  Rene Becker
- *  Thimo Emmerich
- */
-
-#include "spu_common.h"
-
-#include <spu_intrinsics.h>
-#include <spu_mfcio.h>
-
-// Debugging
-//#define DEBUG
-
-#ifdef DEBUG
-#define deprintf(fmt, args... ) \
-	fprintf( stdout, fmt, ##args ); \
-	fflush( stdout );
-#else
-#define deprintf( fmt, args... )
-#endif
-
-struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128)));
-
-/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored
- * there might be the need to retrieve misaligned data, adjust
- * incoming v and u plane to be able to handle this (add 128)
- */
-unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128)));
-unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
-unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128)));
-
-/* A maximum of 4 lines BGRA are stored, 4 byte per pixel */
-unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128)));
-
-/* some vectors needed by the float to int conversion */
-static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
-static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };
-
-void yuv_to_rgb_w16();
-void yuv_to_rgb_w32();
-
-void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr, unsigned int width);
-void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width);
-
-
-int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused)))
-{
-	deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id);
-	uint32_t ea_mfc, mbox;
-	// send ready message
-	spu_write_out_mbox(SPU_READY);
-
-	while (1) {
-		/* Check mailbox */
-		mbox = spu_read_in_mbox();
-		deprintf("[SPU] Message is %u\n", mbox);
-		switch (mbox) {
-			case SPU_EXIT:
-				deprintf("[SPU] fb_writer goes down...\n");
-				return 0;
-			case SPU_START:
-				break;
-			default:
-				deprintf("[SPU] Cannot handle message\n");
-				continue;
-		}
-
-		/* Tag Manager setup */
-		unsigned int tag_id;
-		tag_id = mfc_multi_tag_reserve(1);
-		if (tag_id == MFC_TAG_INVALID) {
-			deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n");
-			return 0;
-		}
-
-		/* DMA transfer for the input parameters */
-		ea_mfc = spu_read_in_mbox();
-		deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc);
-		spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD);
-		DMA_WAIT_TAG(tag_id);
-
-		/* There are alignment issues that involve handling of special cases
-		 * a width of 32 results in a width of 16 in the chrominance
-		 * --> choose the proper handling to optimize the performance
-		 */
-		deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height);
-		if (parms_converter.src_pixel_width & 0x1f) {
-			deprintf("[SPU] Using yuv_to_rgb_w16\n");
-			yuv_to_rgb_w16();
-		} else {
-			deprintf("[SPU] Using yuv_to_rgb_w32\n");
-			yuv_to_rgb_w32();
-		}
-
-		mfc_multi_tag_release(tag_id, 1);
-		deprintf("[SPU] yuv2rgb_spu... done!\n");
-		/* Send FIN message */
-		spu_write_out_mbox(SPU_FIN);
-	}
-
-	return 0;
-}
-
-
-/*
- * float_to_char()
- *
- * converts a float to a character using saturated
- * arithmetic
- *
- * @param s float for conversion
- * @returns converted character
- */
-inline static unsigned char float_to_char(float s) {
-	vector float vec_s = spu_splats(s);
-	vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
-	vec_s = spu_sel(vec_s, vec_0_1, select_1);
-
-	vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
-	vec_s = spu_sel(vec_s, vec_255, select_2);
-	return (unsigned char) spu_extract(vec_s,0);
-}
-
-
-/*
- * vfloat_to_vuint()
- *
- * converts a float vector to an unsinged int vector using saturated
- * arithmetic
- *
- * @param vec_s float vector for conversion
- * @returns converted unsigned int vector
- */
-inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
-	vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s);
-	vec_s = spu_sel(vec_s, vec_0_1, select_1);
-
-	vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255);
-	vec_s = spu_sel(vec_s, vec_255, select_2);
-	return spu_convtu(vec_s,0);
-}
-
-
-void yuv_to_rgb_w16() {
-	// Pixel dimensions of the picture
-	uint32_t width, height;
-
-	// Extract parameters
-	width = parms_converter.src_pixel_width;
-	height = parms_converter.src_pixel_height;
-
-	// Plane data management
-	// Y
-	unsigned char* ram_addr_y = parms_converter.y_plane;
-	// V
-	unsigned char* ram_addr_v = parms_converter.v_plane;
-	// U
-	unsigned char* ram_addr_u = parms_converter.u_plane;
-
-	// BGRA
-	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
-
-	// Strides
-	unsigned int stride_y = width;
-	unsigned int stride_vu = width>>1;
-
-	// Buffer management
-	unsigned int buf_idx = 0;
-	unsigned int size_4lines_y = stride_y<<2;
-	unsigned int size_2lines_y = stride_y<<1;
-	unsigned int size_2lines_vu = stride_vu<<1;
-
-	// 2*width*4byte_per_pixel
-	unsigned int size_2lines_bgra = width<<3;
-
-
-	// start double-buffered processing
-	// 4 lines y
-	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
-
-	// 2 lines v
-	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
-
-	// 2 lines u
-	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
-
-	// Wait for these transfers to be completed
-	DMA_WAIT_TAG((RETR_BUF + buf_idx));
-
-	unsigned int i;
-	for(i=0; i<(height>>2)-1; i++) {
-
-		buf_idx^=1;
-
-		// 4 lines y
-		spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD);
-
-		// 2 lines v
-		spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
-
-		// 2 lines u
-		spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD);
-
-		DMA_WAIT_TAG((RETR_BUF + buf_idx));
-
-		buf_idx^=1;
-
-
-		// Convert YUV to BGRA, store it back (first two lines)
-		yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
-
-		// Next two lines
-		yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
-				v_plane[buf_idx] + stride_vu,
-				u_plane[buf_idx] + stride_vu,
-				bgra + size_2lines_bgra,
-				width);
-
-		// Wait for previous storing transfer to be completed
-		DMA_WAIT_TAG(STR_BUF);
-
-		// Store converted lines in two steps->max transfer size 16384
-		spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
-		ram_addr_bgra += size_2lines_bgra;
-		spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
-		ram_addr_bgra += size_2lines_bgra;
-
-		// Move 4 lines
-		ram_addr_y += size_4lines_y;
-		ram_addr_v += size_2lines_vu;
-		ram_addr_u += size_2lines_vu;
-
-		buf_idx^=1;
-	}
-
-	// Convert YUV to BGRA, store it back (first two lines)
-	yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
-
-	// Next two lines
-	yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y,
-			v_plane[buf_idx] + stride_vu,
-			u_plane[buf_idx] + stride_vu,
-			bgra + size_2lines_bgra,
-			width);
-
-	// Wait for previous storing transfer to be completed
-	DMA_WAIT_TAG(STR_BUF);
-	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
-	ram_addr_bgra += size_2lines_bgra;
-	spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
-
-	// wait for previous storing transfer to be completed
-	DMA_WAIT_TAG(STR_BUF);
-
-}
-
-
-void yuv_to_rgb_w32() {
-	// Pixel dimensions of the picture
-	uint32_t width, height;
-
-	// Extract parameters
-	width = parms_converter.src_pixel_width;
-	height = parms_converter.src_pixel_height;
-
-	// Plane data management
-	// Y
-	unsigned char* ram_addr_y = parms_converter.y_plane;
-	// V
-	unsigned char* ram_addr_v = parms_converter.v_plane;
-	// U
-	unsigned char* ram_addr_u = parms_converter.u_plane;
-
-	// BGRA
-	unsigned char* ram_addr_bgra = parms_converter.dstBuffer;
-
-	// Strides
-	unsigned int stride_y = width;
-	unsigned int stride_vu = width>>1;
-
-	// Buffer management
-	unsigned int buf_idx = 0;
-	unsigned int size_4lines_y = stride_y<<2;
-	unsigned int size_2lines_y = stride_y<<1;
-	unsigned int size_2lines_vu = stride_vu<<1;
-
-	// 2*width*4byte_per_pixel
-	unsigned int size_2lines_bgra = width<<3;
-
-	// start double-buffered processing
-	// 4 lines y
-	spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
-	// 2 lines v
-	spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
-	// 2 lines u
-	spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
-
-	// Wait for these transfers to be completed
-	DMA_WAIT_TAG((RETR_BUF + buf_idx));
-
-	unsigned int i;
-	for(i=0; i < (height>>2)-1; i++) {
-		buf_idx^=1;
-		// 4 lines y
-		spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD);
-		deprintf("4lines = %d\n", size_4lines_y);
-		// 2 lines v
-		spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
-		deprintf("2lines = %d\n", size_2lines_vu);
-		// 2 lines u
-		spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD);
-		deprintf("2lines = %d\n", size_2lines_vu);
-
-		DMA_WAIT_TAG((RETR_BUF + buf_idx));
-
-		buf_idx^=1;
-
-		// Convert YUV to BGRA, store it back (first two lines)
-		yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
-
-		// Next two lines
-		yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
-				v_plane[buf_idx] + stride_vu,
-				u_plane[buf_idx] + stride_vu,
-				bgra + size_2lines_bgra,
-				width);
-
-		// Wait for previous storing transfer to be completed
-		DMA_WAIT_TAG(STR_BUF);
-
-		// Store converted lines in two steps->max transfer size 16384
-		spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
-		ram_addr_bgra += size_2lines_bgra;
-		spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
-		ram_addr_bgra += size_2lines_bgra;
-
-		// Move 4 lines
-		ram_addr_y += size_4lines_y;
-		ram_addr_v += size_2lines_vu;
-		ram_addr_u += size_2lines_vu;
-
-		buf_idx^=1;
-	}
-
-	// Convert YUV to BGRA, store it back (first two lines)
-	yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width);
-
-	// Next two lines
-	yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y,
-			v_plane[buf_idx] + stride_vu,
-			u_plane[buf_idx] + stride_vu,
-			bgra + size_2lines_bgra,
-			width);
-
-	// Wait for previous storing transfer to be completed
-	DMA_WAIT_TAG(STR_BUF);
-	spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
-	ram_addr_bgra += size_2lines_bgra;
-	spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD);
-
-	// Wait for previous storing transfer to be completed
-	DMA_WAIT_TAG(STR_BUF);
-}
-
-
-/* Some vectors needed by the yuv 2 rgb conversion algorithm */
-const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f };
-const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
-const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 };
-const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 };
-const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B };
-const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F };
-
-const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f};
-const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f};
-const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f};
-const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f};
-
-const vector unsigned int vec_alpha =  { 255 << 24, 255 << 24, 255 << 24, 255 << 24 };
-
-const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 };
-const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F };
-
-
-/*
- * yuv_to_rgb_w16()
- *
- * processes to line of yuv-input, width has to be a multiple of 16
- * two lines of yuv are taken as input
- *
- * @param y_addr address of the y plane in local store
- * @param v_addr address of the v plane in local store
- * @param u_addr address of the u plane in local store
- * @param bgra_addr_ address of the bgra output buffer
- * @param width the width in pixel
- */
-void yuv_to_rgb_w16_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
-	// each pixel is stored as an integer
-	unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
-
-	unsigned int x;
-	for(x = 0; x < width; x+=2) {
-		// Gehe zweischrittig durch die zeile, da jeder u und v wert fuer 4 pixel(zwei hoch, zwei breit) gilt
-		const unsigned char Y_1 = *(y_addr + x);
-		const unsigned char Y_2 = *(y_addr + x + 1);
-		const unsigned char Y_3 = *(y_addr + x + width);
-		const unsigned char Y_4 = *(y_addr + x + width + 1);
-		const unsigned char U = *(u_addr + (x >> 1));
-		const unsigned char V = *(v_addr + (x >> 1));
-
-		float V_minus_128 = (float)((float)V - 128.0f);
-		float U_minus_128 = (float)((float)U - 128.0f);
-
-		float R_precalculate = 1.403f * V_minus_128;
-		float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128);
-		float B_precalculate = 1.773f * U_minus_128;
-
-		const unsigned char R_1 = float_to_char((Y_1 + R_precalculate));
-		const unsigned char R_2 = float_to_char((Y_2 + R_precalculate));
-		const unsigned char R_3 = float_to_char((Y_3 + R_precalculate));
-		const unsigned char R_4 = float_to_char((Y_4 + R_precalculate));
-		const unsigned char G_1 = float_to_char((Y_1 + G_precalculate));
-		const unsigned char G_2 = float_to_char((Y_2 + G_precalculate));
-		const unsigned char G_3 = float_to_char((Y_3 + G_precalculate));
-		const unsigned char G_4 = float_to_char((Y_4 + G_precalculate));
-		const unsigned char B_1 = float_to_char((Y_1 + B_precalculate));
-		const unsigned char B_2 = float_to_char((Y_2 + B_precalculate));
-		const unsigned char B_3 = float_to_char((Y_3 + B_precalculate));
-		const unsigned char B_4 = float_to_char((Y_4 + B_precalculate));
-
-		*(bgra_addr + x) = (B_1 << 0)| (G_1 << 8) | (R_1 << 16) | (255 << 24);
-		*(bgra_addr + x + 1) = (B_2 << 0)| (G_2 << 8) | (R_2 << 16) | (255 << 24);
-		*(bgra_addr + x + width) = (B_3 << 0)| (G_3 << 8) | (R_3 << 16) | (255 << 24);
-		*(bgra_addr + x + width + 1) = (B_4 << 0)| (G_4 << 8) | (R_4 << 16) | (255 << 24);
-	}
-}
-
-
-/*
- * yuv_to_rgb_w32()
- *
- * processes to line of yuv-input, width has to be a multiple of 32
- * two lines of yuv are taken as input
- *
- * @param y_addr address of the y plane in local store
- * @param v_addr address of the v plane in local store
- * @param u_addr address of the u plane in local store
- * @param bgra_addr_ address of the bgra output buffer
- * @param width the width in pixel
- */
-void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) {
-	// each pixel is stored as an integer
-	unsigned int* bgra_addr = (unsigned int*) bgra_addr_;
-
-	unsigned int x;
-	for(x = 0; x < width; x+=32) {
-		// Gehe zweischrittig durch die zeile, da jeder u und v wert fuer 4 pixel(zwei hoch, zwei breit) gilt
-
-		const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x));
-		const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16));
-		const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width));
-		const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16));
-		const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1)));
-		const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1)));
-
-		const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128);
-		const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128);
-		const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128);
-		const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128);
-
-		const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128);
-		const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128);
-		const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128);
-		const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128);
-
-		vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0);
-		vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0);
-		vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0);
-		vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0);
-		vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0);
-		vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0);
-		vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0);
-		vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0);
-		vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0);
-		vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0);
-		vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0);
-		vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0);
-		vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0);
-		vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0);
-		vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0);
-		vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0);
-
-		const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1);
-		const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2);
-		const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3);
-		const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4);
-
-		const vector float R1_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_upper);
-		const vector float R2_precalculate = spu_shuffle(R1a_precalculate,  R1a_precalculate, vec_select_floats_lower);
-		const vector float R3_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_upper);
-		const vector float R4_precalculate = spu_shuffle(R2a_precalculate,  R2a_precalculate, vec_select_floats_lower);
-		const vector float R5_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_upper);
-		const vector float R6_precalculate = spu_shuffle(R3a_precalculate,  R3a_precalculate, vec_select_floats_lower);
-		const vector float R7_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_upper);
-		const vector float R8_precalculate = spu_shuffle(R4a_precalculate,  R4a_precalculate, vec_select_floats_lower);
-
-
-		const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff));
-		const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff));
-		const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff));
-		const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff));
-
-		const vector float G1_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_upper);
-		const vector float G2_precalculate = spu_shuffle(G1a_precalculate,  G1a_precalculate, vec_select_floats_lower);
-		const vector float G3_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_upper);
-		const vector float G4_precalculate = spu_shuffle(G2a_precalculate,  G2a_precalculate, vec_select_floats_lower);
-		const vector float G5_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_upper);
-		const vector float G6_precalculate = spu_shuffle(G3a_precalculate,  G3a_precalculate, vec_select_floats_lower);
-		const vector float G7_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_upper);
-		const vector float G8_precalculate = spu_shuffle(G4a_precalculate,  G4a_precalculate, vec_select_floats_lower);
-
-
-		const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1);
-		const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2);
-		const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3);
-		const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4);
-
-		const vector float B1_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_upper);
-		const vector float B2_precalculate = spu_shuffle(B1a_precalculate,  B1a_precalculate, vec_select_floats_lower);
-		const vector float B3_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_upper);
-		const vector float B4_precalculate = spu_shuffle(B2a_precalculate,  B2a_precalculate, vec_select_floats_lower);
-		const vector float B5_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_upper);
-		const vector float B6_precalculate = spu_shuffle(B3a_precalculate,  B3a_precalculate, vec_select_floats_lower);
-		const vector float B7_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_upper);
-		const vector float B8_precalculate = spu_shuffle(B4a_precalculate,  B4a_precalculate, vec_select_floats_lower);
-
-
-		const vector unsigned int  R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate));
-		const vector unsigned int  R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate));
-		const vector unsigned int  R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate));
-		const vector unsigned int  R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate));
-		const vector unsigned int  R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate));
-		const vector unsigned int  R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate));
-		const vector unsigned int  R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate));
-		const vector unsigned int  R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate));
-		const vector unsigned int  R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate));
-		const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate));
-		const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate));
-		const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate));
-		const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate));
-		const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate));
-		const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate));
-		const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate));
-
-		const vector unsigned int  G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate));
-		const vector unsigned int  G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate));
-		const vector unsigned int  G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate));
-		const vector unsigned int  G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate));
-		const vector unsigned int  G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate));
-		const vector unsigned int  G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate));
-		const vector unsigned int  G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate));
-		const vector unsigned int  G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate));
-		const vector unsigned int  G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate));
-		const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate));
-		const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate));
-		const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate));
-		const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate));
-		const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate));
-		const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate));
-		const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate));
-
-		const vector unsigned int  B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate));
-		const vector unsigned int  B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate));
-		const vector unsigned int  B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate));
-		const vector unsigned int  B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate));
-		const vector unsigned int  B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate));
-		const vector unsigned int  B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate));
-		const vector unsigned int  B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate));
-		const vector unsigned int  B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate));
-		const vector unsigned int  B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate));
-		const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate));
-		const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate));
-		const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate));
-		const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate));
-		const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate));
-		const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate));
-		const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate));
-
-		*((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha,  B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1)));
-		*((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha,  B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1)));
-		*((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha,  B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1)));
-		*((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha,  B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1)));
-		*((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha,  B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1)));
-		*((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha,  B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1)));
-		*((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha,  B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1)));
-		*((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha,  B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1)));
-		*((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha,  B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1)));
-		*((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1)));
-		*((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1)));
-		*((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1)));
-		*((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1)));
-		*((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1)));
-		*((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1)));
-		*((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1)));
-	}
-}
-