view src/video/ps3/spulibs/bilin_scaler.c @ 4165:3b8ac3d311a2 SDL-1.2

Hello. This patch provides basic support for video on the Sony PS3 Linux framebuffer. Scaling, format-conversion, and drawing is done from the SPEs, so there is little performance impact to PPE applications. This is by no means production quality code, but it is a very good start and a good example of how to use the PS3's hardware capabilities to accelerate video playback on the box. The driver has been verified to work with ffplay, mplayer and xine. This piece of software has been developed at the IBM R&D Lab in Boeblingen, Germany and is now returned to the community. Enjoy ! Signed-off-by: D.Herrendoerfer < d.herrendoerfer [at] de [dot] ibm [dot] com >
author Sam Lantinga <slouken@libsdl.org>
date Thu, 02 Apr 2009 04:06:55 +0000
parents
children
line wrap: on
line source

/*
 * SDL - Simple DirectMedia Layer
 * CELL BE Support for PS3 Framebuffer
 * Copyright (C) 2008, 2009 International Business Machines Corporation
 *
 * This library is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published
 * by the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
 * USA
 *
 *  Martin Lowinski  <lowinski [at] de [dot] ibm [dot] com>
 *  Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com>
 *  SPE code based on research by:
 *  Rene Becker
 *  Thimo Emmerich
 */

#include "spu_common.h"

#include <spu_intrinsics.h>
#include <spu_mfcio.h>

// Debugging
//#define DEBUG

/*
 * deprintf(): printf-style debug output to stdout, flushed immediately.
 * Compiled out entirely unless DEBUG is defined.
 *
 * Both variants expand to a single statement (do { ... } while (0)) so the
 * macro can be used safely as the body of an unbraced if/else.
 */
#ifdef DEBUG
#define deprintf(fmt, args... ) \
	do { \
		fprintf( stdout, fmt, ##args ); \
		fflush( stdout ); \
	} while (0)
#else
#define deprintf( fmt, args... ) do { } while (0)
#endif

/* Scaling parameters; main() fills this by DMA from the address it is handed. */
struct scale_parms_t parms __attribute__((aligned(128)));

/* Double-buffered source line buffers ([2] = the two halves).
 * A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored
 * (i.e. 4 lines Y, 2 lines V, 2 lines U per half).
 * There might be the need to retrieve misaligned data, so each line
 * reserves 128 extra bytes to be able to handle the misaligned
 * v and u plane fetches.
 */
unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128)));
unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));
unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128)));

/* Double-buffered temp-buffers for the scaled output:
 * 4 lines Y in total (2 per half), and half a Y line width of V and of U
 * per half, which is one chroma line each.
 */
unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128)));
unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));
unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128)));

/* Clamp bounds used by vfloat_to_vuint(): upper bound 255.0, lower bound 0.1 */
static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f };
static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f };

/* Line scalers implemented outside this part of the file. The callers below
 * hand in a pointer to a fetched pair of source lines, the destination line
 * buffer, the destination width in pixels, the splatted horizontal scale
 * factor, the splatted north-south weight and the source line stride.
 * NOTE(review): the w8/w16 suffix presumably names the pixel granularity
 * handled per step -- confirm against the implementation.
 */
void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);
void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride);

/* Scaler variants; main() picks one depending on whether the source and the
 * destination width are multiples of 32 (w32) or not (w16).
 */
void scale_srcw16_dstw16();
void scale_srcw16_dstw32();
void scale_srcw32_dstw16();
void scale_srcw32_dstw32();

/*
 * SPU program entry point.
 *
 * Fetches the scale_parms_t block located at argp by DMA, then runs the
 * scaler variant that matches the source and destination widths: a width
 * that is a multiple of 32 selects the "w32" flavour, any other width the
 * "w16" flavour.
 *
 * @param spe_id only used for debug output
 * @param argp   address of the parameter block (truncated to 32 bit for the DMA)
 * @returns always 0
 */
int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp )
{
	deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id);

	/* pull in the parameter block and wait until it has arrived */
	spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD);
	DMA_WAIT_TAG(TAG_INIT);

	deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height,
			parms.dst_pixel_width, parms.dst_pixel_height);

	/* a width with none of the low five bits set is a multiple of 32 */
	const int src_is_w32 = (parms.src_pixel_width & 0x1f) == 0;
	const int dst_is_w32 = (parms.dst_pixel_width & 0x1F) == 0;

	if(src_is_w32 && dst_is_w32) {
		deprintf("[SPU] Using scale_srcw32_dstw32\n");
		scale_srcw32_dstw32();
	} else if(src_is_w32) {
		deprintf("[SPU] Using scale_srcw32_dstw16\n");
		scale_srcw32_dstw16();
	} else if(dst_is_w32) {
		deprintf("[SPU] Using scale_srcw16_dstw32\n");
		scale_srcw16_dstw32();
	} else {
		deprintf("[SPU] Using scale_srcw16_dstw16\n");
		scale_srcw16_dstw16();
	}

	deprintf("[SPU] bilin_scaler_spu... done!\n");

	return 0;
}


/*
 * vfloat_to_vuint()
 *
 * Converts a float vector to an unsigned int vector using saturated
 * arithmetic: every element is first limited to the range spanned by
 * vec_0_1 (lower bound) and vec_255 (upper bound) and then converted
 * with spu_convtu using a scale of 0.
 *
 * @param vec_s float vector for conversion
 * @returns converted unsigned int vector
 */
inline static vector unsigned int vfloat_to_vuint(vector float vec_s) {
	/* lift every element that lies below the lower bound */
	const vector unsigned int below_min = spu_cmpgt(vec_0_1, vec_s);
	const vector float floored = spu_sel(vec_s, vec_0_1, below_min);

	/* cap every element that lies above the upper bound */
	const vector unsigned int above_max = spu_cmpgt(floored, vec_255);
	const vector float clamped = spu_sel(floored, vec_255, above_max);

	return spu_convtu(clamped, 0);
}


/*
 * scale_srcw16_dstw16()
 *
 * Scales the planar YVU picture described by the global parms and stores
 * the scaled planes to main memory. main() selects this variant when
 * neither the source nor the destination width is a multiple of 32.
 *
 * Destination layout (starting at parms.dstBuffer): the Y plane, followed
 * by the V plane, followed by the U plane; V and U each take a quarter of
 * the Y plane size.
 *
 * Processing is double buffered: while one pair of destination Y lines
 * (plus one V and one U line) is scaled and stored, the source lines for
 * the next pair are already being fetched by DMA.
 *
 * A destination of zero width or with fewer than two lines is ignored.
 *
 * NOTE(review): every fetch transfers two consecutive source lines starting
 * at the computed line index, so the fetch for the last picture lines
 * appears to reach one line beyond the plane -- confirm that the source
 * buffers tolerate this.
 */
void scale_srcw16_dstw16() {
	// extract parameters
	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;

	unsigned int src_width = parms.src_pixel_width;
	unsigned int src_height = parms.src_pixel_height;
	unsigned int dst_width = parms.dst_pixel_width;
	unsigned int dst_height = parms.dst_pixel_height;

	// A destination without at least one pair of lines cannot be processed.
	// Bailing out here also keeps (dst_height>>1)-1 in the loop condition
	// from wrapping around and avoids dividing by a zero width.
	if( dst_width == 0 || dst_height < 2 ) {
		return;
	}

	// YVU
	unsigned int src_linestride_y = src_width;
	unsigned int src_dbl_linestride_y = src_width<<1;
	unsigned int src_linestride_vu = src_width>>1;
	unsigned int src_dbl_linestride_vu = src_width;

	// scaled YVU
	unsigned int scaled_src_linestride_y = dst_width;

	// ram addresses
	unsigned char* src_addr_y = parms.y_plane;
	unsigned char* src_addr_v = parms.v_plane;
	unsigned char* src_addr_u = parms.u_plane;

	// for handling misalignment, addresses are precalculated
	unsigned char* precalc_src_addr_v;
	unsigned char* precalc_src_addr_u;

	unsigned int dst_picture_size = dst_width*dst_height;

	// Sizes for destination
	unsigned int dst_dbl_linestride_y = dst_width<<1;	// two Y lines
	unsigned int dst_linestride_vu = dst_width>>1;		// one V or one U line

	// Perform address calculation for Y, V and U in main memory with dst_addr as base
	unsigned char* dst_addr_main_memory_y = dst_addr;
	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);

	// calculate scale factors
	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
	float y_scale = (float)src_height/(float)dst_height;

	// double buffered processing
	// buffer switching
	unsigned int curr_src_idx = 0;
	unsigned int curr_dst_idx = 0;
	unsigned int next_src_idx, next_dst_idx;

	// source line indices for the line pair fetched ahead
	unsigned int next_interpl_y_upper, next_interpl_y_lower;
	// only 1 line v/u output, both planes have the same dimension
	unsigned int next_interpl_vu;

	// weights, calculated in every loop iteration
	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
	vector float vf_next_NSweight_y_upper;
	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
	vector float vf_next_NSweight_vu;

	// fractional line positions in the src picture
	float next_src_y_upper, next_src_y_lower, next_src_vu;

	// line indices for the dst picture
	unsigned int dst_y=0, dst_vu=0;

	// offset for the v and u plane to handle misalignment
	unsigned int curr_lsoff_v = 0, next_lsoff_v;
	unsigned int curr_lsoff_u = 0, next_lsoff_u;

	// The first upper line is source line 0 with weight 0; the first lower
	// line (destination line 1) needs its source position and weight computed.
	float curr_src_y_lower = 1.0f*y_scale;
	unsigned int curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );


	// start partially double buffered processing
	// get initial data, 2 sets of y, 1 set v, 1 set u
	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
			src_dbl_linestride_y,
			RETR_BUF,
			0, 0 );
	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );

	/* iteration loop
	 * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
	 * the scaled output is 2 lines y, 1 line v, 1 line u
	 * the scaled YVU output is stored to RAM
	 * the last line pair is handled after the loop (nothing left to prefetch)
	 */
	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
		dst_y = dst_vu<<1;

		// calculate next indices
		next_src_vu = ((float)dst_vu+1)*y_scale;
		next_src_y_upper = ((float)dst_y+2)*y_scale;
		next_src_y_lower = ((float)dst_y+3)*y_scale;

		next_interpl_vu = (unsigned int) next_src_vu;
		next_interpl_y_upper = (unsigned int) next_src_y_upper;
		next_interpl_y_lower = (unsigned int) next_src_y_lower;

		// calculate weight NORTH-SOUTH
		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );

		// get next lines
		next_src_idx = curr_src_idx^1;
		next_dst_idx = curr_dst_idx^1;

		// 4 lines y
		mfc_get( y_plane[next_src_idx],
				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
				src_dbl_linestride_y,
				RETR_BUF+next_src_idx,
				0, 0 );
		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
				src_dbl_linestride_y,
				RETR_BUF+next_src_idx,
				0, 0 );

		// 2 lines v: fetch from the 16-byte aligned address below the line
		// start and remember the offset of the real data within the buffer
		precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
		next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
		mfc_get( v_plane[next_src_idx],
				((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
				src_dbl_linestride_vu+(next_lsoff_v<<1),
				RETR_BUF+next_src_idx,
				0, 0 );
		// 2 lines u: same scheme, sized with the u plane's own offset
		precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
		next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
		mfc_get( u_plane[next_src_idx],
				((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
				src_dbl_linestride_vu+(next_lsoff_u<<1),
				RETR_BUF+next_src_idx,
				0, 0 );

		// the current source lines must have arrived before they are scaled
		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

		// scaling
		// work line y_upper
		bilinear_scale_line_w16( y_plane[curr_src_idx],
				scaled_y_plane[curr_src_idx],
				dst_width,
				vf_x_scale,
				vf_curr_NSweight_y_upper,
				src_linestride_y );
		// work line y_lower
		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
				dst_width,
				vf_x_scale,
				vf_curr_NSweight_y_lower,
				src_linestride_y );
		// work line v
		bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
				scaled_v_plane[curr_src_idx],
				dst_width>>1,
				vf_x_scale,
				vf_curr_NSweight_vu,
				src_linestride_vu );
		// work line u
		bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
				scaled_u_plane[curr_src_idx],
				dst_width>>1,
				vf_x_scale,
				vf_curr_NSweight_vu,
				src_linestride_vu );


		// Store the result back to main memory into a destination buffer in YUV format
		//---------------------------------------------------------------------------------------------
		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

		// Perform three DMA transfers to 3 different locations in the main memory!
		// dst_vu counts the stored line pairs: each pair is two Y lines and
		// one line each of V and U.
		mfc_put(	scaled_y_plane[curr_src_idx],					// What from local store (addr)
				(unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
				dst_dbl_linestride_y,						// Two Y lines
				STR_BUF+curr_dst_idx,						// Tag
				0, 0 );

		mfc_put(	scaled_v_plane[curr_src_idx],					// What from local store (addr)
				(unsigned int)dst_addr_main_memory_v + (dst_vu*dst_linestride_vu),	// Destination in main memory (addr)
				dst_linestride_vu,						// One V line
				STR_BUF+curr_dst_idx,						// Tag
				0, 0 );

		mfc_put(	scaled_u_plane[curr_src_idx],					// What from local store (addr)
				(unsigned int)dst_addr_main_memory_u + (dst_vu*dst_linestride_vu),	// Destination in main memory (addr)
				dst_linestride_vu,						// One U line
				STR_BUF+curr_dst_idx,						// Tag
				0, 0 );
		//---------------------------------------------------------------------------------------------


		// update for next cycle
		curr_src_idx = next_src_idx;
		curr_dst_idx = next_dst_idx;

		// the prefetched lines become the current ones, so their weights must too
		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
		vf_curr_NSweight_vu = vf_next_NSweight_vu;

		curr_lsoff_v = next_lsoff_v;
		curr_lsoff_u = next_lsoff_u;
	}



	// last line pair: already fetched, nothing more to prefetch
	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

	// scaling
	// work line y_upper
	bilinear_scale_line_w16( y_plane[curr_src_idx],
			scaled_y_plane[curr_src_idx],
			dst_width,
			vf_x_scale,
			vf_curr_NSweight_y_upper,
			src_linestride_y );
	// work line y_lower
	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
			dst_width,
			vf_x_scale,
			vf_curr_NSweight_y_lower,
			src_linestride_y );
	// work line v
	bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
			scaled_v_plane[curr_src_idx],
			dst_width>>1,
			vf_x_scale,
			vf_curr_NSweight_vu,
			src_linestride_vu );
	// work line u
	bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
			scaled_u_plane[curr_src_idx],
			dst_width>>1,
			vf_x_scale,
			vf_curr_NSweight_vu,
			src_linestride_vu );


	// Store the result back to main memory into a destination buffer in YUV format
	//---------------------------------------------------------------------------------------------
	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

	// Perform three DMA transfers to 3 different locations in the main memory!
	// dst_vu now indexes the last line pair.
	mfc_put(	scaled_y_plane[curr_src_idx],					// What from local store (addr)
			(unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
			dst_dbl_linestride_y,						// Two Y lines
			STR_BUF+curr_dst_idx,						// Tag
			0, 0 );

	mfc_put(	scaled_v_plane[curr_src_idx],					// What from local store (addr)
			(unsigned int)dst_addr_main_memory_v + (dst_vu*dst_linestride_vu),	// Destination in main memory (addr)
			dst_linestride_vu,						// One V line
			STR_BUF+curr_dst_idx,						// Tag
			0, 0 );

	mfc_put(	scaled_u_plane[curr_src_idx],					// What from local store (addr)
			(unsigned int)dst_addr_main_memory_u + (dst_vu*dst_linestride_vu),	// Destination in main memory (addr)
			dst_linestride_vu,						// One U line
			STR_BUF+curr_dst_idx,						// Tag
			0, 0 );

	// wait for completion
	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
	//---------------------------------------------------------------------------------------------
}


/*
 * scale_srcw16_dstw32()
 *
 * Scales the planar YVU picture described by the global parms and stores
 * the scaled planes to main memory. main() selects this variant when the
 * source width is not a multiple of 32 but the destination width is.
 * No colour space conversion takes place here; the output stays YVU.
 *
 * Destination layout (starting at parms.dstBuffer): the Y plane, followed
 * by the V plane, followed by the U plane; V and U each take a quarter of
 * the Y plane size.
 *
 * Processing is double buffered: while one pair of destination Y lines
 * (plus one V and one U line) is scaled and stored, the source lines for
 * the next pair are already being fetched by DMA.
 *
 * A destination of zero width or with fewer than two lines is ignored.
 *
 * NOTE(review): every fetch transfers two consecutive source lines starting
 * at the computed line index, so the fetch for the last picture lines
 * appears to reach one line beyond the plane -- confirm that the source
 * buffers tolerate this.
 */
void scale_srcw16_dstw32() {
	// extract parameters
	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;

	unsigned int src_width = parms.src_pixel_width;
	unsigned int src_height = parms.src_pixel_height;
	unsigned int dst_width = parms.dst_pixel_width;
	unsigned int dst_height = parms.dst_pixel_height;

	// A destination without at least one pair of lines cannot be processed.
	// Bailing out here also keeps (dst_height>>1)-1 in the loop condition
	// from wrapping around and avoids dividing by a zero width.
	if( dst_width == 0 || dst_height < 2 ) {
		return;
	}

	// YVU
	unsigned int src_linestride_y = src_width;
	unsigned int src_dbl_linestride_y = src_width<<1;
	unsigned int src_linestride_vu = src_width>>1;
	unsigned int src_dbl_linestride_vu = src_width;
	// scaled YVU
	unsigned int scaled_src_linestride_y = dst_width;

	// ram addresses
	unsigned char* src_addr_y = parms.y_plane;
	unsigned char* src_addr_v = parms.v_plane;
	unsigned char* src_addr_u = parms.u_plane;

	unsigned int dst_picture_size = dst_width*dst_height;

	// Sizes for destination
	unsigned int dst_dbl_linestride_y = dst_width<<1;	// two Y lines
	unsigned int dst_linestride_vu = dst_width>>1;		// one V or one U line

	// Perform address calculation for Y, V and U in main memory with dst_addr as base
	unsigned char* dst_addr_main_memory_y = dst_addr;
	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);


	// for handling misalignment, addresses are precalculated
	unsigned char* precalc_src_addr_v;
	unsigned char* precalc_src_addr_u;

	// calculate scale factors
	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
	float y_scale = (float)src_height/(float)dst_height;

	// double buffered processing
	// buffer switching
	unsigned int curr_src_idx = 0;
	unsigned int curr_dst_idx = 0;
	unsigned int next_src_idx, next_dst_idx;

	// source line indices for the line pair fetched ahead
	unsigned int next_interpl_y_upper, next_interpl_y_lower;
	// only 1 line v/u output, both planes have the same dimension
	unsigned int next_interpl_vu;

	// weights, calculated in every loop iteration
	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
	vector float vf_next_NSweight_y_upper;
	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
	vector float vf_next_NSweight_vu;

	// fractional line positions in the src picture
	float next_src_y_upper, next_src_y_lower, next_src_vu;

	// line indices for the dst picture
	unsigned int dst_y=0, dst_vu=0;

	// offset for the v and u plane to handle misalignment
	unsigned int curr_lsoff_v = 0, next_lsoff_v;
	unsigned int curr_lsoff_u = 0, next_lsoff_u;

	// The first upper line is source line 0 with weight 0; the first lower
	// line (destination line 1) needs its source position and weight computed.
	float curr_src_y_lower = 1.0f*y_scale;
	unsigned int curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );


	// start partially double buffered processing
	// get initial data, 2 sets of y, 1 set v, 1 set u
	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
			src_dbl_linestride_y,
			RETR_BUF,
			0, 0 );
	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );

	// iteration loop
	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
	// the scaled output is 2 lines y, 1 line v, 1 line u
	// the scaled YVU output is stored to RAM
	// the last line pair is handled after the loop (nothing left to prefetch)
	for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) {
		dst_y = dst_vu<<1;

		// calculate next indices
		next_src_vu = ((float)dst_vu+1)*y_scale;
		next_src_y_upper = ((float)dst_y+2)*y_scale;
		next_src_y_lower = ((float)dst_y+3)*y_scale;

		next_interpl_vu = (unsigned int) next_src_vu;
		next_interpl_y_upper = (unsigned int) next_src_y_upper;
		next_interpl_y_lower = (unsigned int) next_src_y_lower;

		// calculate weight NORTH-SOUTH
		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );

		// get next lines
		next_src_idx = curr_src_idx^1;
		next_dst_idx = curr_dst_idx^1;

		// 4 lines y
		mfc_get( y_plane[next_src_idx],
				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
				src_dbl_linestride_y,
				RETR_BUF+next_src_idx,
				0, 0 );
		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
				src_dbl_linestride_y,
				RETR_BUF+next_src_idx,
				0, 0 );

		// 2 lines v: fetch from the 16-byte aligned address below the line
		// start and remember the offset of the real data within the buffer
		precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu);
		next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F;
		mfc_get( v_plane[next_src_idx],
				((unsigned int) precalc_src_addr_v)&0xFFFFFFF0,
				src_dbl_linestride_vu+(next_lsoff_v<<1),
				RETR_BUF+next_src_idx,
				0, 0 );
		// 2 lines u: same scheme, sized with the u plane's own offset
		precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu);
		next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F;
		mfc_get( u_plane[next_src_idx],
				((unsigned int) precalc_src_addr_u)&0xFFFFFFF0,
				src_dbl_linestride_vu+(next_lsoff_u<<1),
				RETR_BUF+next_src_idx,
				0, 0 );

		// the current source lines must have arrived before they are scaled
		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

		// scaling
		// work line y_upper
		bilinear_scale_line_w16( y_plane[curr_src_idx],
				scaled_y_plane[curr_src_idx],
				dst_width,
				vf_x_scale,
				vf_curr_NSweight_y_upper,
				src_linestride_y );
		// work line y_lower
		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
				dst_width,
				vf_x_scale,
				vf_curr_NSweight_y_lower,
				src_linestride_y );
		// work line v
		bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
				scaled_v_plane[curr_src_idx],
				dst_width>>1,
				vf_x_scale,
				vf_curr_NSweight_vu,
				src_linestride_vu );
		// work line u
		bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
				scaled_u_plane[curr_src_idx],
				dst_width>>1,
				vf_x_scale,
				vf_curr_NSweight_vu,
				src_linestride_vu );

		// Store the result back to main memory into a destination buffer in YUV format
		//---------------------------------------------------------------------------------------------
		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

		// Perform three DMA transfers to 3 different locations in the main memory!
		// dst_vu counts the stored line pairs: each pair is two Y lines and
		// one line each of V and U.

		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
				(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
				dst_dbl_linestride_y,								// Two Y lines
				STR_BUF+curr_dst_idx,								// Tag
				0, 0 );

		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_linestride_vu),	// Destination in main memory (addr)
				dst_linestride_vu,								// One V line
				STR_BUF+curr_dst_idx,								// Tag
				0, 0 );

		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
				(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_linestride_vu),	// Destination in main memory (addr)
				dst_linestride_vu,								// One U line
				STR_BUF+curr_dst_idx,								// Tag
				0, 0 );
		//---------------------------------------------------------------------------------------------


		// update for next cycle
		curr_src_idx = next_src_idx;
		curr_dst_idx = next_dst_idx;

		// the prefetched lines become the current ones, so their weights must too
		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
		vf_curr_NSweight_vu = vf_next_NSweight_vu;

		curr_lsoff_v = next_lsoff_v;
		curr_lsoff_u = next_lsoff_u;
	}



	// last line pair: already fetched, nothing more to prefetch
	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );

	// scaling
	// work line y_upper
	bilinear_scale_line_w16( y_plane[curr_src_idx],
			scaled_y_plane[curr_src_idx],
			dst_width,
			vf_x_scale,
			vf_curr_NSweight_y_upper,
			src_linestride_y );
	// work line y_lower
	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
			dst_width,
			vf_x_scale,
			vf_curr_NSweight_y_lower,
			src_linestride_y );
	// work line v
	bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v,
			scaled_v_plane[curr_src_idx],
			dst_width>>1,
			vf_x_scale,
			vf_curr_NSweight_vu,
			src_linestride_vu );
	// work line u
	bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u,
			scaled_u_plane[curr_src_idx],
			dst_width>>1,
			vf_x_scale,
			vf_curr_NSweight_vu,
			src_linestride_vu );

	// Store the result back to main memory into a destination buffer in YUV format
	//---------------------------------------------------------------------------------------------
	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

	// Perform three DMA transfers to 3 different locations in the main memory!
	// dst_vu now indexes the last line pair.

	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
			dst_dbl_linestride_y,								// Two Y lines
			STR_BUF+curr_dst_idx,								// Tag
			0, 0 );

	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_linestride_vu),	// Destination in main memory (addr)
			dst_linestride_vu,								// One V line
			STR_BUF+curr_dst_idx,								// Tag
			0, 0 );

	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_linestride_vu),	// Destination in main memory (addr)
			dst_linestride_vu,								// One U line
			STR_BUF+curr_dst_idx,								// Tag
			0, 0 );

	// wait for completion
	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
	//---------------------------------------------------------------------------------------------
}


/*
 * scale_srcw32_dstw16()
 *
 * Bilinear scaler for a planar YVU picture described by the global parms
 * structure. Going by its name and the original notes, it is meant for a
 * source width that is a multiple of 32 and a destination width that is a
 * multiple of 16 (TODO confirm against the dispatcher that selects it).
 *
 * For every pair of destination Y lines the function
 *   - fetches 2+2 source Y lines and 2 source lines each of V and U by DMA,
 *   - scales them to 2 Y lines, 1 V line and 1 U line in local store,
 *   - writes the scaled lines to parms.dstBuffer, which is laid out as
 *     Y plane, then V plane, then U plane.
 * No colour space conversion is done here; the output stays YVU.
 *
 * Source fetches are double buffered: the lines for the next pair are
 * requested before the current pair is scaled.
 */
void scale_srcw32_dstw16() {
	// extract parameters
	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;

	unsigned int src_width = parms.src_pixel_width;
	unsigned int src_height = parms.src_pixel_height;
	unsigned int dst_width = parms.dst_pixel_width;
	unsigned int dst_height = parms.dst_pixel_height;

	// Nothing to produce without at least one pair of destination lines;
	// without this check the unsigned row count below would wrap around.
	if( dst_width == 0 || (dst_height>>1) == 0 ) {
		return;
	}

	// YVU source strides in bytes
	unsigned int src_linestride_y = src_width;
	unsigned int src_dbl_linestride_y = src_width<<1;
	unsigned int src_linestride_vu = src_width>>1;
	unsigned int src_dbl_linestride_vu = src_width;
	// scaled YVU
	unsigned int scaled_src_linestride_y = dst_width;

	// ram addresses
	unsigned char* src_addr_y = parms.y_plane;
	unsigned char* src_addr_v = parms.v_plane;
	unsigned char* src_addr_u = parms.u_plane;

	unsigned int dst_picture_size = dst_width*dst_height;

	// Sizes for destination: two Y lines, one V/U line per store
	unsigned int dst_dbl_linestride_y = dst_width<<1;
	unsigned int dst_dbl_linestride_vu = dst_width>>1;

	// Perform address calculation for Y, V and U in main memory with dst_addr as base
	unsigned char* dst_addr_main_memory_y = dst_addr;
	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);

	// calculate scale factors
	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
	float y_scale = (float)src_height/(float)dst_height;

	// double buffered processing
	// buffer switching
	unsigned int curr_src_idx = 0;
	unsigned int curr_dst_idx = 0;
	unsigned int next_src_idx, next_dst_idx;

	// source line numbers for the next pass
	unsigned int next_interpl_y_upper, next_interpl_y_lower;
	unsigned int next_interpl_vu;

	// NORTH-SOUTH weights; the first upper Y line and the first V/U line
	// map exactly onto source line 0, hence weight 0
	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
	vector float vf_next_NSweight_y_upper;
	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
	vector float vf_next_NSweight_vu;

	// fractional source line positions for the next pass
	float next_src_y_upper, next_src_y_lower, next_src_vu;

	// line indices for the dst picture
	unsigned int dst_y=0, dst_vu=0;

	// source position, line number and weight of destination Y line 1
	float curr_src_y_lower = 1.0f*y_scale;
	unsigned int curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );


	// start partially double buffered processing
	// get initial data, 2 sets of y, 1 set v, 1 set u
	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
			src_dbl_linestride_y,
			RETR_BUF,
			0, 0 );
	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );

	// iteration loop
	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
	// the scaled output is 2 lines y, 1 line v, 1 line u
	// the scaled output is stored to RAM
	// (the last pair of lines is handled after the loop, without a prefetch)
	for( dst_vu=0; dst_vu+1<(dst_height>>1); dst_vu++ ) {
		dst_y = dst_vu<<1;

		// calculate next indices
		next_src_vu = ((float)dst_vu+1)*y_scale;
		next_src_y_upper = ((float)dst_y+2)*y_scale;
		next_src_y_lower = ((float)dst_y+3)*y_scale;

		next_interpl_vu = (unsigned int) next_src_vu;
		next_interpl_y_upper = (unsigned int) next_src_y_upper;
		next_interpl_y_lower = (unsigned int) next_src_y_lower;

		// calculate weight NORTH-SOUTH
		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );

		// get next lines
		next_src_idx = curr_src_idx^1;
		next_dst_idx = curr_dst_idx^1;

		// 4 lines y
		mfc_get( y_plane[next_src_idx],
				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
				src_dbl_linestride_y,
				RETR_BUF+next_src_idx,
				0, 0 );
		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
				src_dbl_linestride_y,
				RETR_BUF+next_src_idx,
				0, 0 );

		// 2 lines v
		mfc_get( v_plane[next_src_idx],
				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
				src_dbl_linestride_vu,
				RETR_BUF+next_src_idx,
				0, 0 );
		// 2 lines u
		mfc_get( u_plane[next_src_idx],
				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
				src_dbl_linestride_vu,
				RETR_BUF+next_src_idx,
				0, 0 );

		// source lines of the current pass must have arrived
		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
		// the previous store from these output buffers must have drained
		// before the scaler overwrites them
		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

		// scaling
		// work line y_upper
		bilinear_scale_line_w16( y_plane[curr_src_idx],
				scaled_y_plane[curr_src_idx],
				dst_width,
				vf_x_scale,
				vf_curr_NSweight_y_upper,
				src_linestride_y );
		// work line y_lower
		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
				dst_width,
				vf_x_scale,
				vf_curr_NSweight_y_lower,
				src_linestride_y );
		// NOTE(review): the chroma width is dst_width>>1, which is not a
		// multiple of 16 when dst_width is only a multiple of 16, while
		// bilinear_scale_line_w16() advances 16 pixels per step - confirm
		// whether bilinear_scale_line_w8() was intended for v and u here.
		// work line v
		bilinear_scale_line_w16( v_plane[curr_src_idx],
				scaled_v_plane[curr_src_idx],
				dst_width>>1,
				vf_x_scale,
				vf_curr_NSweight_vu,
				src_linestride_vu );
		// work line u
		bilinear_scale_line_w16( u_plane[curr_src_idx],
				scaled_u_plane[curr_src_idx],
				dst_width>>1,
				vf_x_scale,
				vf_curr_NSweight_vu,
				src_linestride_vu );

		//---------------------------------------------------------------------------------------------
		// Perform three DMA transfers to 3 different locations in the main memory!
		// dst_vu:	index of the current V/U line (and of the current pair of Y lines)

		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
				(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
				dst_dbl_linestride_y,								// Two Y lines
				STR_BUF+curr_dst_idx,								// Tag
				0, 0 );

		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
				dst_dbl_linestride_vu,								// One V line (dst_width/2 bytes)
				STR_BUF+curr_dst_idx,								// Tag
				0, 0 );

		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
				(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
				dst_dbl_linestride_vu,								// One U line (dst_width/2 bytes)
				STR_BUF+curr_dst_idx,								// Tag
				0, 0 );
		//---------------------------------------------------------------------------------------------


		// update for next cycle
		curr_src_idx = next_src_idx;
		curr_dst_idx = next_dst_idx;

		// the weights computed for the prefetched lines become current
		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
		vf_curr_NSweight_vu = vf_next_NSweight_vu;
	}


	// last pair of lines: already fetched, nothing left to prefetch
	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
	// make sure the output buffers are no longer being read by an older store
	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

	// scaling
	// work line y_upper
	bilinear_scale_line_w16( y_plane[curr_src_idx],
			scaled_y_plane[curr_src_idx],
			dst_width,
			vf_x_scale,
			vf_curr_NSweight_y_upper,
			src_linestride_y );
	// work line y_lower
	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
			dst_width,
			vf_x_scale,
			vf_curr_NSweight_y_lower,
			src_linestride_y );
	// work line v
	bilinear_scale_line_w16( v_plane[curr_src_idx],
			scaled_v_plane[curr_src_idx],
			dst_width>>1,
			vf_x_scale,
			vf_curr_NSweight_vu,
			src_linestride_vu );
	// work line u
	bilinear_scale_line_w16( u_plane[curr_src_idx],
			scaled_u_plane[curr_src_idx],
			dst_width>>1,
			vf_x_scale,
			vf_curr_NSweight_vu,
			src_linestride_vu );


	//---------------------------------------------------------------------------------------------
	// Perform three DMA transfers to 3 different locations in the main memory!
	// dst_vu now indexes the last V/U line / last pair of Y lines

	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
			dst_dbl_linestride_y,								// Two Y lines
			STR_BUF+curr_dst_idx,								// Tag
			0, 0 );

	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
			dst_dbl_linestride_vu,								// One V line (dst_width/2 bytes)
			STR_BUF+curr_dst_idx,								// Tag
			0, 0 );

	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
			dst_dbl_linestride_vu,								// One U line (dst_width/2 bytes)
			STR_BUF+curr_dst_idx,								// Tag
			0, 0 );

	// wait for completion of the stores on both buffers, so that the whole
	// picture is in main memory when this function returns
	DMA_WAIT_TAG( (STR_BUF+(curr_dst_idx^1)) );
	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
	//---------------------------------------------------------------------------------------------
}


/**
 * scale_srcw32_dstw32()
 *
 * Bilinear scaler for a planar YVU picture described by the global parms
 * structure. Going by its name and the original notes, it is meant for
 * source and destination widths that are both multiples of 32
 * (TODO confirm against the dispatcher that selects it).
 *
 * For every pair of destination Y lines the function
 *   - fetches 2+2 source Y lines and 2 source lines each of V and U by DMA,
 *   - scales them to 2 Y lines, 1 V line and 1 U line in local store,
 *   - writes the scaled lines to parms.dstBuffer, which is laid out as
 *     Y plane, then V plane, then U plane.
 * No colour space conversion is done here; the output stays YVU.
 *
 * Source fetches are double buffered: the lines for the next pair are
 * requested before the current pair is scaled.
 */
void scale_srcw32_dstw32() {
	// extract parameters
	unsigned char* dst_addr = (unsigned char *)parms.dstBuffer;

	unsigned int src_width = parms.src_pixel_width;
	unsigned int src_height = parms.src_pixel_height;
	unsigned int dst_width = parms.dst_pixel_width;
	unsigned int dst_height = parms.dst_pixel_height;

	// Nothing to produce without at least one pair of destination lines;
	// without this check the unsigned row count below would wrap around.
	if( dst_width == 0 || (dst_height>>1) == 0 ) {
		return;
	}

	// YVU source strides in bytes
	unsigned int src_linestride_y = src_width;
	unsigned int src_dbl_linestride_y = src_width<<1;
	unsigned int src_linestride_vu = src_width>>1;
	unsigned int src_dbl_linestride_vu = src_width;

	// scaled YVU
	unsigned int scaled_src_linestride_y = dst_width;

	// ram addresses
	unsigned char* src_addr_y = parms.y_plane;
	unsigned char* src_addr_v = parms.v_plane;
	unsigned char* src_addr_u = parms.u_plane;

	unsigned int dst_picture_size = dst_width*dst_height;

	// Sizes for destination: two Y lines, one V/U line per store
	unsigned int dst_dbl_linestride_y = dst_width<<1;
	unsigned int dst_dbl_linestride_vu = dst_width>>1;

	// Perform address calculation for Y, V and U in main memory with dst_addr as base
	unsigned char* dst_addr_main_memory_y = dst_addr;
	unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size;
	unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2);

	// calculate scale factors
	vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width );
	float y_scale = (float)src_height/(float)dst_height;

	// double buffered processing
	// buffer switching
	unsigned int curr_src_idx = 0;
	unsigned int curr_dst_idx = 0;
	unsigned int next_src_idx, next_dst_idx;

	// source line numbers for the next pass
	unsigned int next_interpl_y_upper, next_interpl_y_lower;
	unsigned int next_interpl_vu;

	// NORTH-SOUTH weights; the first upper Y line and the first V/U line
	// map exactly onto source line 0, hence weight 0
	vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f };
	vector float vf_next_NSweight_y_upper;
	vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower;
	vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f };
	vector float vf_next_NSweight_vu;

	// fractional source line positions for the next pass
	float next_src_y_upper, next_src_y_lower, next_src_vu;

	// line indices for the dst picture
	unsigned int dst_y=0, dst_vu=0;

	// source position, line number and weight of destination Y line 1
	float curr_src_y_lower = 1.0f*y_scale;
	unsigned int curr_interpl_y_lower = (unsigned int)curr_src_y_lower;
	vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower );


	// start partially double buffered processing
	// get initial data, 2 sets of y, 1 set v, 1 set u
	mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 );
	mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y,
			(unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y),
			src_dbl_linestride_y,
			RETR_BUF,
			0, 0 );
	mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 );
	mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 );

	// iteration loop
	// within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved
	// the scaled output is 2 lines y, 1 line v, 1 line u
	// the scaled output is stored to RAM
	// (the last pair of lines is handled after the loop, without a prefetch)
	for( dst_vu=0; dst_vu+1<(dst_height>>1); dst_vu++ ) {
		dst_y = dst_vu<<1;

		// calculate next indices
		next_src_vu = ((float)dst_vu+1)*y_scale;
		next_src_y_upper = ((float)dst_y+2)*y_scale;
		next_src_y_lower = ((float)dst_y+3)*y_scale;

		next_interpl_vu = (unsigned int) next_src_vu;
		next_interpl_y_upper = (unsigned int) next_src_y_upper;
		next_interpl_y_lower = (unsigned int) next_src_y_lower;

		// calculate weight NORTH-SOUTH
		vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu );
		vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper );
		vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower );

		// get next lines
		next_src_idx = curr_src_idx^1;
		next_dst_idx = curr_dst_idx^1;

		// 4 lines y
		mfc_get( y_plane[next_src_idx],
				(unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y),
				src_dbl_linestride_y,
				RETR_BUF+next_src_idx,
				0, 0 );
		mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y,
				(unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y),
				src_dbl_linestride_y,
				RETR_BUF+next_src_idx,
				0, 0 );

		// 2 lines v
		mfc_get( v_plane[next_src_idx],
				(unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu),
				src_dbl_linestride_vu,
				RETR_BUF+next_src_idx,
				0, 0 );
		// 2 lines u
		mfc_get( u_plane[next_src_idx],
				(unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu),
				src_dbl_linestride_vu,
				RETR_BUF+next_src_idx,
				0, 0 );

		// source lines of the current pass must have arrived
		DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
		// the previous store from these output buffers must have drained
		// before the scaler overwrites them
		DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

		// scaling
		// work line y_upper
		bilinear_scale_line_w16( y_plane[curr_src_idx],
				scaled_y_plane[curr_src_idx],
				dst_width,
				vf_x_scale,
				vf_curr_NSweight_y_upper,
				src_linestride_y );
		// work line y_lower
		bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
				scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
				dst_width,
				vf_x_scale,
				vf_curr_NSweight_y_lower,
				src_linestride_y );
		// work line v
		bilinear_scale_line_w16( v_plane[curr_src_idx],
				scaled_v_plane[curr_src_idx],
				dst_width>>1,
				vf_x_scale,
				vf_curr_NSweight_vu,
				src_linestride_vu );
		// work line u
		bilinear_scale_line_w16( u_plane[curr_src_idx],
				scaled_u_plane[curr_src_idx],
				dst_width>>1,
				vf_x_scale,
				vf_curr_NSweight_vu,
				src_linestride_vu );



		// Store the result back to main memory into a destination buffer in YUV format
		//---------------------------------------------------------------------------------------------
		// Perform three DMA transfers to 3 different locations in the main memory!
		// dst_vu:	index of the current V/U line (and of the current pair of Y lines)

		mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
				(unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
				dst_dbl_linestride_y,								// Two Y lines
				STR_BUF+curr_dst_idx,								// Tag
				0, 0 );

		mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
				(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
				dst_dbl_linestride_vu,								// One V line (dst_width/2 bytes)
				STR_BUF+curr_dst_idx,								// Tag
				0, 0 );

		mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
				(unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
				dst_dbl_linestride_vu,								// One U line (dst_width/2 bytes)
				STR_BUF+curr_dst_idx,								// Tag
				0, 0 );
		//---------------------------------------------------------------------------------------------


		// update for next cycle
		curr_src_idx = next_src_idx;
		curr_dst_idx = next_dst_idx;

		// the weights computed for the prefetched lines become current
		vf_curr_NSweight_y_upper = vf_next_NSweight_y_upper;
		vf_curr_NSweight_y_lower = vf_next_NSweight_y_lower;
		vf_curr_NSweight_vu = vf_next_NSweight_vu;
	}


	// last pair of lines: already fetched, nothing left to prefetch
	DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) );
	// make sure the output buffers are no longer being read by an older store
	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );

	// scaling
	// work line y_upper
	bilinear_scale_line_w16( y_plane[curr_src_idx],
			scaled_y_plane[curr_src_idx],
			dst_width,
			vf_x_scale,
			vf_curr_NSweight_y_upper,
			src_linestride_y );
	// work line y_lower
	bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y,
			scaled_y_plane[curr_src_idx]+scaled_src_linestride_y,
			dst_width,
			vf_x_scale,
			vf_curr_NSweight_y_lower,
			src_linestride_y );
	// work line v
	bilinear_scale_line_w16( v_plane[curr_src_idx],
			scaled_v_plane[curr_src_idx],
			dst_width>>1,
			vf_x_scale,
			vf_curr_NSweight_vu,
			src_linestride_vu );
	// work line u
	bilinear_scale_line_w16( u_plane[curr_src_idx],
			scaled_u_plane[curr_src_idx],
			dst_width>>1,
			vf_x_scale,
			vf_curr_NSweight_vu,
			src_linestride_vu );


	// Store the result back to main memory into a destination buffer in YUV format
	//---------------------------------------------------------------------------------------------
	// Perform three DMA transfers to 3 different locations in the main memory!
	// dst_vu now indexes the last V/U line / last pair of Y lines

	mfc_put(	scaled_y_plane[curr_src_idx],							// What from local store (addr)
			(unsigned int)  dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y),	// Destination in main memory (addr)
			dst_dbl_linestride_y,								// Two Y lines
			STR_BUF+curr_dst_idx,								// Tag
			0, 0 );

	mfc_put(	scaled_v_plane[curr_src_idx],							// What from local store (addr)
			(unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu),	// Destination in main memory (addr)
			dst_dbl_linestride_vu,								// One V line (dst_width/2 bytes)
			STR_BUF+curr_dst_idx,								// Tag
			0, 0 );

	mfc_put(	scaled_u_plane[curr_src_idx],							// What from local store (addr)
			(unsigned int)  dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr)
			dst_dbl_linestride_vu,								// One U line (dst_width/2 bytes)
			STR_BUF+curr_dst_idx,								// Tag
			0, 0 );

	// wait for completion of the stores on both buffers, so that the whole
	// picture is in main memory when this function returns
	DMA_WAIT_TAG( (STR_BUF+(curr_dst_idx^1)) );
	DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) );
	//---------------------------------------------------------------------------------------------
}


/*
 * bilinear_scale_line_w8()
 *
 * processes a line of yuv-input, width has to be a multiple of 8
 * scaled yuv-output is written to local store buffer
 *
 * @param src buffer for 2 lines input
 * @param dst_ buffer for 1 line output
 * @param dst_width the width of the destination line
 * @param vf_x_scale a float vector, at each entry is the x_scale-factor
 * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
 * @param src_linestride the stride of the srcline
 */
void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {

	unsigned char* dst = dst_;

	unsigned int dst_x;
	for( dst_x=0; dst_x<dst_width; dst_x+=8) {
		// address calculation for loading the 4 surrounding pixel of each calculated
		// destination pixel
		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
		// lower range->first 4 pixel
		// upper range->next 4 pixel
		vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 };
		vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 };
		vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range );
		vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range );

		// calculate weight EAST-WEST
		vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 );
		vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 );
		vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale );
		vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale );
		vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 );
		vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 );
		vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 );
		vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 );
		vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range );
		vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range );

		// calculate address offset
		//
		// pixel NORTH WEST
		vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range;
		vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range;

		// pixel NORTH EAST-->(offpixelNW+1)
		vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
		vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 );
		vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 );

		// SOUTH-WEST-->(offpixelNW+src_linestride)
		vector unsigned int vui_srclinestride = spu_splats( src_linestride );
		vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range );
		vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range );

		// SOUTH-EAST-->(offpixelNW+src_linestride+1)
		vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range );
		vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range );

		// calculate each address
		vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
		vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range );
		vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range );
		vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range );
		vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range );

		vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range );
		vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range );
		vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range );
		vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range );

		// get each pixel
		//
		// scalar load, afterwards insertion into the right position
		// NORTH WEST
		vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
		vector unsigned char vuc_pixel_NW_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 );
		vuc_pixel_NW_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )),
				vuc_pixel_NW_lower_range, 7 );
		vuc_pixel_NW_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )),
				vuc_pixel_NW_lower_range, 11 );
		vuc_pixel_NW_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )),
				vuc_pixel_NW_lower_range, 15 );

		vector unsigned char vuc_pixel_NW_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 );
		vuc_pixel_NW_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )),
				vuc_pixel_NW_upper_range, 7 );
		vuc_pixel_NW_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )),
				vuc_pixel_NW_upper_range, 11 );
		vuc_pixel_NW_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )),
				vuc_pixel_NW_upper_range, 15 );

		// NORTH EAST
		vector unsigned char vuc_pixel_NE_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 );
		vuc_pixel_NE_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )),
				vuc_pixel_NE_lower_range, 7 );
		vuc_pixel_NE_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )),
				vuc_pixel_NE_lower_range, 11 );
		vuc_pixel_NE_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )),
				vuc_pixel_NE_lower_range, 15 );

		vector unsigned char vuc_pixel_NE_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 );
		vuc_pixel_NE_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )),
				vuc_pixel_NE_upper_range, 7 );
		vuc_pixel_NE_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )),
				vuc_pixel_NE_upper_range, 11 );
		vuc_pixel_NE_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )),
				vuc_pixel_NE_upper_range, 15 );


		// SOUTH WEST
		vector unsigned char vuc_pixel_SW_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 );
		vuc_pixel_SW_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )),
				vuc_pixel_SW_lower_range, 7 );
		vuc_pixel_SW_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )),
				vuc_pixel_SW_lower_range, 11 );
		vuc_pixel_SW_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )),
				vuc_pixel_SW_lower_range, 15 );

		vector unsigned char vuc_pixel_SW_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 );
		vuc_pixel_SW_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )),
				vuc_pixel_SW_upper_range, 7 );
		vuc_pixel_SW_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )),
				vuc_pixel_SW_upper_range, 11 );
		vuc_pixel_SW_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )),
				vuc_pixel_SW_upper_range, 15 );

		// SOUTH EAST
		vector unsigned char vuc_pixel_SE_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 );
		vuc_pixel_SE_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )),
				vuc_pixel_SE_lower_range, 7 );
		vuc_pixel_SE_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )),
				vuc_pixel_SE_lower_range, 11 );
		vuc_pixel_SE_lower_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )),
				vuc_pixel_SE_lower_range, 15 );

		vector unsigned char vuc_pixel_SE_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 );
		vuc_pixel_SE_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )),
				vuc_pixel_SE_upper_range, 7 );
		vuc_pixel_SE_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )),
				vuc_pixel_SE_upper_range, 11 );
		vuc_pixel_SE_upper_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )),
				vuc_pixel_SE_upper_range, 15 );


		// convert to float
		vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 );
		vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 );

		vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 );
		vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 );

		vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 );
		vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 );

		vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 );
		vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 );



		// first linear interpolation: EWtop
		// EWtop = NW + EWweight*(NE-NW)
		//
		// lower range
		vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range );
		vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range,
								vf_EWtop_lower_range_tmp,
								vf_pixel_NW_lower_range );

		// upper range
		vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range );
		vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range,
								vf_EWtop_upper_range_tmp,
								vf_pixel_NW_upper_range );



		// second linear interpolation: EWbottom
		// EWbottom = SW + EWweight*(SE-SW)
		//
		// lower range
		vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range );
		vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range,
								vf_EWbottom_lower_range_tmp,
								vf_pixel_SW_lower_range );

		// upper range
		vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range );
		vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range,
								vf_EWbottom_upper_range_tmp,
								vf_pixel_SW_upper_range );



		// third linear interpolation: the bilinear interpolated value
		// result = EWtop + NSweight*(EWbottom-EWtop);
		//
		// lower range
		vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range );
		vector float vf_result_lower_range = spu_madd( vf_NSweight,
								vf_result_lower_range_tmp,
								vf_EWtop_lower_range );

		// upper range
		vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range );
		vector float vf_result_upper_range = spu_madd( vf_NSweight,
								vf_result_upper_range_tmp,
								vf_EWtop_upper_range );


		// convert back: using saturated arithmetic
		vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range );
		vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range );

		// merge results->lower,upper
		vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F,
							       0x13, 0x17, 0x1B, 0x1F,
							       0x00, 0x00, 0x00, 0x00,
							       0x00, 0x00, 0x00, 0x00 };

		vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range,
								(vector unsigned char) vui_result_upper_range,
								vuc_mask_merge_result );

		// partial storing
		vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00,
						      0x00, 0x00, 0x00, 0x00,
						      0xFF, 0xFF, 0xFF, 0xFF,
						      0xFF, 0xFF, 0xFF, 0xFF };


		// get currently stored data
		vector unsigned char vuc_orig = *((vector unsigned char*)dst);

		// clear currently stored data
		vuc_orig = spu_and( vuc_orig,
				spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) );

		// rotate result according to storing address
		vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F );

		// store result
		*((vector unsigned char*)dst) = spu_or( vuc_result,
							vuc_orig );
		dst += 8;
	}
}


/*
 * bilinear_scale_line_w16()
 *
 * processes a line of yuv-input, width has to be a multiple of 16
 * scaled yuv-output is written to local store buffer
 *
 * @param src buffer for 2 lines input
 * @param dst_ buffer for 1 line output
 * @param dst_width the width of the destination line
 * @param vf_x_scale a float vector, at each entry is the x_scale-factor
 * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line
 * @param src_linestride the stride of the srcline
 */
void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) {

	unsigned char* dst = dst_;

	unsigned int dst_x;
	for( dst_x=0; dst_x<dst_width; dst_x+=16) {
		// address calculation for loading the 4 surrounding pixel of each calculated
		// destination pixel
		vector unsigned int vui_dst_x_tmp = spu_splats( dst_x );
		// parallelised processing
		// first range->pixel 1 2 3 4
		// second range->pixel 5 6 7 8
		// third range->pixel 9 10 11 12
		// fourth range->pixel 13 14 15 16
		vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 };
		vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 };
		vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 };
		vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 };
		vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range );
		vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range );
		vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range );
		vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range );

		// calculate weight EAST-WEST
		vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 );
		vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 );
		vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 );
		vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 );
		vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale );
		vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale );
		vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale );
		vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale );
		vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 );
		vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 );
		vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 );
		vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 );
		vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 );
		vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 );
		vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 );
		vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 );
		vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range );
		vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range );
		vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range );
		vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range );

		// calculate address offset
		//
		// pixel NORTH WEST
		vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range;
		vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range;
		vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range;
		vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range;

		// pixel NORTH EAST-->(offpixelNW+1)
		vector unsigned int vui_add_1 = { 1, 1, 1, 1 };
		vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 );
		vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 );
		vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 );
		vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 );

		// SOUTH-WEST-->(offpixelNW+src_linestride)
		vector unsigned int vui_srclinestride = spu_splats( src_linestride );
		vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range );
		vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range );
		vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range );
		vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range );

		// SOUTH-EAST-->(offpixelNW+src_linestride+1)
		vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range );
		vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range );
		vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range );
		vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range );

		// calculate each address
		vector unsigned int vui_src_ls = spu_splats( (unsigned int) src );
		vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range );
		vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range );
		vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range );
		vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range );

		vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range );
		vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range );
		vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range );
		vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range );

		vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range );
		vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range );
		vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range );
		vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range );

		vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range );
		vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range );
		vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range );
		vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range );


		// get each pixel
		//
		// scalar load, afterwards insertion into the right position
		// NORTH WEST
		// first range
		vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
		vector unsigned char vuc_pixel_NW_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 );
		vuc_pixel_NW_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )),
				vuc_pixel_NW_first_range, 7 );
		vuc_pixel_NW_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )),
				vuc_pixel_NW_first_range, 11 );
		vuc_pixel_NW_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )),
				vuc_pixel_NW_first_range, 15 );
		// second range
		vector unsigned char vuc_pixel_NW_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 );
		vuc_pixel_NW_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )),
				vuc_pixel_NW_second_range, 7 );
		vuc_pixel_NW_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )),
				vuc_pixel_NW_second_range, 11 );
		vuc_pixel_NW_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )),
				vuc_pixel_NW_second_range, 15 );
		// third range
		vector unsigned char vuc_pixel_NW_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 );
		vuc_pixel_NW_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )),
				vuc_pixel_NW_third_range, 7 );
		vuc_pixel_NW_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )),
				vuc_pixel_NW_third_range, 11 );
		vuc_pixel_NW_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )),
				vuc_pixel_NW_third_range, 15 );
		// fourth range
		vector unsigned char vuc_pixel_NW_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 );
		vuc_pixel_NW_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )),
				vuc_pixel_NW_fourth_range, 7 );
		vuc_pixel_NW_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )),
				vuc_pixel_NW_fourth_range, 11 );
		vuc_pixel_NW_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )),
				vuc_pixel_NW_fourth_range, 15 );

		// NORTH EAST
		// first range
		vector unsigned char vuc_pixel_NE_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 );
		vuc_pixel_NE_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )),
				vuc_pixel_NE_first_range, 7 );
		vuc_pixel_NE_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )),
				vuc_pixel_NE_first_range, 11 );
		vuc_pixel_NE_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )),
				vuc_pixel_NE_first_range, 15 );
		// second range
		vector unsigned char vuc_pixel_NE_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 );
		vuc_pixel_NE_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )),
				vuc_pixel_NE_second_range, 7 );
		vuc_pixel_NE_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )),
				vuc_pixel_NE_second_range, 11 );
		vuc_pixel_NE_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )),
				vuc_pixel_NE_second_range, 15 );
		// third range
		vector unsigned char vuc_pixel_NE_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 );
		vuc_pixel_NE_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )),
				vuc_pixel_NE_third_range, 7 );
		vuc_pixel_NE_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )),
				vuc_pixel_NE_third_range, 11 );
		vuc_pixel_NE_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )),
				vuc_pixel_NE_third_range, 15 );
		// fourth range
		vector unsigned char vuc_pixel_NE_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 );
		vuc_pixel_NE_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )),
				vuc_pixel_NE_fourth_range, 7 );
		vuc_pixel_NE_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )),
				vuc_pixel_NE_fourth_range, 11 );
		vuc_pixel_NE_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )),
				vuc_pixel_NE_fourth_range, 15 );

		// SOUTH WEST
		// first range
		vector unsigned char vuc_pixel_SW_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 );
		vuc_pixel_SW_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )),
				vuc_pixel_SW_first_range, 7 );
		vuc_pixel_SW_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )),
				vuc_pixel_SW_first_range, 11 );
		vuc_pixel_SW_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )),
				vuc_pixel_SW_first_range, 15 );
		// second range
		vector unsigned char vuc_pixel_SW_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 );
		vuc_pixel_SW_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )),
				vuc_pixel_SW_second_range, 7 );
		vuc_pixel_SW_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )),
				vuc_pixel_SW_second_range, 11 );
		vuc_pixel_SW_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )),
				vuc_pixel_SW_second_range, 15 );
		// third range
		vector unsigned char vuc_pixel_SW_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 );
		vuc_pixel_SW_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )),
				vuc_pixel_SW_third_range, 7 );
		vuc_pixel_SW_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )),
				vuc_pixel_SW_third_range, 11 );
		vuc_pixel_SW_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )),
				vuc_pixel_SW_third_range, 15 );
		// fourth range
		vector unsigned char vuc_pixel_SW_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 );
		vuc_pixel_SW_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )),
				vuc_pixel_SW_fourth_range, 7 );
		vuc_pixel_SW_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )),
				vuc_pixel_SW_fourth_range, 11 );
		vuc_pixel_SW_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )),
				vuc_pixel_SW_fourth_range, 15 );

		// NORTH EAST
		// first range
		vector unsigned char vuc_pixel_SE_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 );
		vuc_pixel_SE_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )),
				vuc_pixel_SE_first_range, 7 );
		vuc_pixel_SE_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )),
				vuc_pixel_SE_first_range, 11 );
		vuc_pixel_SE_first_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )),
				vuc_pixel_SE_first_range, 15 );
		// second range
		vector unsigned char vuc_pixel_SE_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 );
		vuc_pixel_SE_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )),
				vuc_pixel_SE_second_range, 7 );
		vuc_pixel_SE_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )),
				vuc_pixel_SE_second_range, 11 );
		vuc_pixel_SE_second_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )),
				vuc_pixel_SE_second_range, 15 );
		// third range
		vector unsigned char vuc_pixel_SE_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 );
		vuc_pixel_SE_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )),
				vuc_pixel_SE_third_range, 7 );
		vuc_pixel_SE_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )),
				vuc_pixel_SE_third_range, 11 );
		vuc_pixel_SE_third_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )),
				vuc_pixel_SE_third_range, 15 );
		// fourth range
		vector unsigned char vuc_pixel_SE_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 );
		vuc_pixel_SE_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )),
				vuc_pixel_SE_fourth_range, 7 );
		vuc_pixel_SE_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )),
				vuc_pixel_SE_fourth_range, 11 );
		vuc_pixel_SE_fourth_range = spu_insert(
				*((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )),
				vuc_pixel_SE_fourth_range, 15 );



		// convert to float
		vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 );
		vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 );
		vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 );
		vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 );

		vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 );
		vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 );
		vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 );
		vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 );

		vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 );
		vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 );
		vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 );
		vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 );

		vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 );
		vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 );
		vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 );
		vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 );

		// first linear interpolation: EWtop
		// EWtop = NW + EWweight*(NE-NW)
		//
		// first range
		vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range );
		vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range,
								vf_EWtop_first_range_tmp,
								vf_pixel_NW_first_range );

		// second range
		vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range );
		vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range,
								vf_EWtop_second_range_tmp,
								vf_pixel_NW_second_range );

		// third range
		vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range );
		vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range,
								vf_EWtop_third_range_tmp,
								vf_pixel_NW_third_range );

		// fourth range
		vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range );
		vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range,
								vf_EWtop_fourth_range_tmp,
								vf_pixel_NW_fourth_range );



		// second linear interpolation: EWbottom
		// EWbottom = SW + EWweight*(SE-SW)
		//
		// first range
		vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range );
		vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range,
								vf_EWbottom_first_range_tmp,
								vf_pixel_SW_first_range );

		// second range
		vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range );
		vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range,
								vf_EWbottom_second_range_tmp,
								vf_pixel_SW_second_range );
		// first range
		vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range );
		vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range,
								vf_EWbottom_third_range_tmp,
								vf_pixel_SW_third_range );

		// first range
		vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range );
		vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range,
								vf_EWbottom_fourth_range_tmp,
								vf_pixel_SW_fourth_range );



		// third linear interpolation: the bilinear interpolated value
		// result = EWtop + NSweight*(EWbottom-EWtop);
		//
		// first range
		vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range );
		vector float vf_result_first_range = spu_madd( vf_NSweight,
								vf_result_first_range_tmp,
								vf_EWtop_first_range );

		// second range
		vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range );
		vector float vf_result_second_range = spu_madd( vf_NSweight,
								vf_result_second_range_tmp,
								vf_EWtop_second_range );

		// third range
		vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range );
		vector float vf_result_third_range = spu_madd( vf_NSweight,
								vf_result_third_range_tmp,
								vf_EWtop_third_range );

		// fourth range
		vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range );
		vector float vf_result_fourth_range = spu_madd( vf_NSweight,
								vf_result_fourth_range_tmp,
								vf_EWtop_fourth_range );



		// convert back: using saturated arithmetic
		vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range );
		vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range );
		vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range );
		vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range );

		// merge results->lower,upper
		vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F,
							       		    0x13, 0x17, 0x1B, 0x1F,
							       		    0x00, 0x00, 0x00, 0x00,
							       		    0x00, 0x00, 0x00, 0x00 };

		vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00,
							       		    0x00, 0x00, 0x00, 0x00,
									    0x03, 0x07, 0x0B, 0x0F,
							       		    0x13, 0x17, 0x1B, 0x1F };

		vector unsigned char vuc_result_first_second =
						spu_shuffle( (vector unsigned char) vui_result_first_range,
								 (vector unsigned char) vui_result_second_range,
								vuc_mask_merge_result_first_second );

		vector unsigned char vuc_result_third_fourth =
						spu_shuffle( (vector unsigned char) vui_result_third_range,
								 (vector unsigned char) vui_result_fourth_range,
								vuc_mask_merge_result_third_fourth );

		// store result
		*((vector unsigned char*)dst) = spu_or( vuc_result_first_second,
							vuc_result_third_fourth );
		dst += 16;
	}
}