Mercurial > sdl-ios-xcode
changeset 3148:104786a909a2 gsoc2009_ps3
Scaling (bilinear) of YUV-Textures working.
author | Martin Lowinski <martin@goldtopf.org> |
---|---|
date | Fri, 19 Jun 2009 05:22:00 +0000 |
parents | a80760096937 |
children | b143d794bff1 |
files | src/video/ps3/SDL_ps3render.c src/video/ps3/spulibs/Makefile src/video/ps3/spulibs/bilin_scaler.c |
diffstat | 3 files changed, 2140 insertions(+), 34 deletions(-) [+] |
line wrap: on
line diff
--- a/src/video/ps3/SDL_ps3render.c Thu Jun 18 03:27:12 2009 +0000 +++ b/src/video/ps3/SDL_ps3render.c Fri Jun 19 05:22:00 2009 +0000 @@ -40,6 +40,7 @@ /* Stores the executable name */ extern spe_program_handle_t yuv2rgb_spu; +extern spe_program_handle_t bilin_scaler_spu; /* SDL surface based renderer implementation */ @@ -103,9 +104,13 @@ unsigned int double_buffering; /* SPE threading stuff */ - spu_data_t * converter_thread_data; + spu_data_t *converter_thread_data; + spu_data_t *scaler_thread_data; + /* YUV converting transfer data */ volatile struct yuv2rgb_parms_t * converter_parms __attribute__((aligned(128))); + /* Scaler transfer data */ + volatile struct scale_parms_t * scaler_parms __attribute__((aligned(128))); } SDL_PS3_RenderData; typedef struct @@ -207,7 +212,8 @@ /* Create SPU parms structure */ data->converter_parms = (struct yuv2rgb_parms_t *) memalign(16, sizeof(struct yuv2rgb_parms_t)); - if (data->converter_parms == NULL) { + data->scaler_parms = (struct scale_parms_t *) memalign(16, sizeof(struct scale_parms_t)); + if (data->converter_parms == NULL || data->scaler_parms == NULL) { SDL_PS3_DestroyRenderer(renderer); SDL_OutOfMemory(); return NULL; @@ -215,12 +221,18 @@ /* Set up the SPEs */ data->converter_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t)); - if (data->converter_thread_data == NULL) { + data->scaler_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t)); + if (data->converter_thread_data == NULL || data->scaler_thread_data == NULL) { SDL_PS3_DestroyRenderer(renderer); SDL_OutOfMemory(); return NULL; } + data->scaler_thread_data->program = bilin_scaler_spu; + data->scaler_thread_data->program_name = "bilin_scaler_spu"; + data->scaler_thread_data->keepalive = 0; + data->scaler_thread_data->booted = 0; + data->converter_thread_data->program = yuv2rgb_spu; data->converter_thread_data->program_name = "yuv2rgb_spu"; data->converter_thread_data->keepalive = 1; @@ -289,7 +301,7 @@ break; } if ((texture->format & SDL_PIXELFORMAT_YV12 || texture->format & SDL_PIXELFORMAT_IYUV) - && texture->w % 8 == 0 && texture->h % 8 == 0) { + && texture->w % 16 == 0 && texture->h % 16 == 0) { data->accelerated = 1; } } else { @@ -337,17 +349,16 @@ Uint8 *src, *dst; int row; size_t length; - int dstpitch; Uint8 *dstpixels; src = (Uint8 *) pixels; - dst = (Uint8 *) dstpixels + rect->y * dstpitch + rect->x + dst = (Uint8 *) dstpixels + rect->y * data->pitch + rect->x * SDL_BYTESPERPIXEL(texture->format); length = rect->w * SDL_BYTESPERPIXEL(texture->format); for (row = 0; row < rect->h; ++row) { SDL_memcpy(dst, src, length); src += pitch; - dst += dstpitch; + dst += data->pitch; } } deprintf(1, "-PS3_UpdateTexture()\n"); @@ -495,16 +506,18 @@ deprintf(1, "dstrect->x = %u\n", dstrect->x); deprintf(1, "dstrect->y = %u\n", dstrect->y); + deprintf(1, "texture->w = %u\n", texture->w); + deprintf(1, "texture->h = %u\n", texture->h); + if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) { deprintf(1, "SDL_ISPIXELFORMAT_FOURCC = true\n"); if (txdata->accelerated) { deprintf(1, "Use SPE for scaling/converting\n"); - if (srcrect-> != dstrect->w || srcrect->h != dstrect->h) { - // do scaling - } + SDL_SW_YUVTexture *swdata = (SDL_SW_YUVTexture *) txdata->yuv; Uint8 *lum, *Cr, *Cb; - SDL_SW_YUVTexture *swdata = (SDL_SW_YUVTexture *) txdata->yuv; + Uint8 *scaler_out = NULL; + Uint8 *dstpixels; switch (swdata->format) { case SDL_PIXELFORMAT_YV12: lum = swdata->planes[0]; @@ -520,23 +533,62 @@ return -1; } - data->converter_parms->y_plane = lum; - data->converter_parms->v_plane = Cr; - data->converter_parms->u_plane = Cb; - data->converter_parms->src_pixel_width = swdata->w; - data->converter_parms->src_pixel_height = swdata->h; - data->converter_parms->dstBuffer = (Uint8 *)data->screen->pixels; + if (srcrect->w != dstrect->w || srcrect->h != dstrect->h) { + /* Alloc mem for scaled YUV picture */ + scaler_out = (Uint8 *) memalign(16, dstrect->w * dstrect->h + ((dstrect->w * dstrect->h) >> 1)); + if (scaler_out == NULL) { + SDL_OutOfMemory(); + return -1; + } + + /* Set parms for scaling */ + data->scaler_parms->src_pixel_width = srcrect->w; + data->scaler_parms->src_pixel_height = srcrect->h; + data->scaler_parms->dst_pixel_width = dstrect->w; + data->scaler_parms->dst_pixel_height = dstrect->h; + data->scaler_parms->y_plane = lum; + data->scaler_parms->v_plane = Cr; + data->scaler_parms->u_plane = Cb; + data->scaler_parms->dstBuffer = scaler_out; + data->scaler_thread_data->argp = (void *)data->scaler_parms; + + /* Scale the YUV overlay to given size */ + SPE_Start(data->scaler_thread_data); + SPE_Stop(data->scaler_thread_data); + + /* Set parms for converting after scaling */ + data->converter_parms->y_plane = scaler_out; + data->converter_parms->v_plane = scaler_out + dstrect->w * dstrect->h; + data->converter_parms->u_plane = scaler_out + dstrect->w * dstrect->h + ((dstrect->w * dstrect->h) >> 2); + } else { + data->converter_parms->y_plane = lum; + data->converter_parms->v_plane = Cr; + data->converter_parms->u_plane = Cb; + } + + dstpixels = (Uint8 *) data->screen->pixels + dstrect->y * data->screen->pitch + dstrect->x + * SDL_BYTESPERPIXEL(texture->format); + data->converter_parms->src_pixel_width = dstrect->w; + data->converter_parms->src_pixel_height = dstrect->h; + data->converter_parms->dstBuffer = dstpixels/*(Uint8 *)data->screen->pixels*/; data->converter_thread_data->argp = (void *)data->converter_parms; - /* Convert YUV overlay to RGB */ + /* Convert YUV texture to RGB */ SPE_SendMsg(data->converter_thread_data, SPU_START); SPE_SendMsg(data->converter_thread_data, (unsigned int)data->converter_thread_data->argp); SPE_WaitForMsg(data->converter_thread_data, SPU_FIN); + if (scaler_out) { + free(scaler_out); + } } else { deprintf(1, "Use software for scaling/converting\n"); + Uint8 *dst; + /* FIXME: Not good */ + dst = (Uint8 *) data->screen->pixels + dstrect->y * data->screen->pitch + dstrect->x + * SDL_BYTESPERPIXEL(texture->format); return SDL_SW_CopyYUVToRGB(txdata->yuv, srcrect, display->current_mode.format, - dstrect->w, dstrect->h, data->screen->pixels, + dstrect->w, dstrect->h, dst/*data->screen->pixels*/, data->screen->pitch); } } else { @@ -545,18 +597,16 @@ Uint8 *src, *dst; int row; size_t length; - int dstpitch; Uint8 *dstpixels; src = (Uint8 *) txdata->pixels; - dst = (Uint8 *) data->screen->pixels + dstrect->y * dstpitch + dstrect->x + dst = (Uint8 *) data->screen->pixels + dstrect->y * data->screen->pitch + dstrect->x * SDL_BYTESPERPIXEL(texture->format); length = dstrect->w * SDL_BYTESPERPIXEL(texture->format); for (row = 0; row < dstrect->h; ++row) { - //deprintf(1, "Copying texture to screen...\n"); SDL_memcpy(dst, src, length); - src += dstpitch; - dst += dstpitch; + src += txdata->pitch; + dst += data->screen->pitch; } } @@ -664,6 +714,12 @@ } /* Shutdown SPE and related resources */ + if (data->scaler_thread_data) { + free((void *)data->scaler_thread_data); + } + if (data->scaler_parms) { + free((void *)data->scaler_parms); + } if (data->converter_thread_data) { SPE_Shutdown(data->converter_thread_data); free((void *)data->converter_thread_data);
--- a/src/video/ps3/spulibs/Makefile Thu Jun 18 03:27:12 2009 +0000 +++ b/src/video/ps3/spulibs/Makefile Fri Jun 19 05:22:00 2009 +0000 @@ -16,8 +16,8 @@ all: libfb_writer_spu.a libfb_writer_spu.so \ - libyuv2rgb_spu.so libyuv2rgb_spu.a -# libbilin_scaler_spu.so libbilin_scaler_spu.a + libyuv2rgb_spu.so libyuv2rgb_spu.a \ + libbilin_scaler_spu.so libbilin_scaler_spu.a # fb_writer @@ -56,25 +56,25 @@ $(PPU_LD) -o libbilin_scaler_spu.so -shared -soname=libbilin_scaler_spu.so bilin_scaler_spu-embed.o install: libfb_writer_spu.a libfb_writer_spu.so \ - libyuv2rgb_spu.so libyuv2rgb_spu.a -# libbilin_scaler_spu.so libbilin_scaler_spu.a + libyuv2rgb_spu.so libyuv2rgb_spu.a \ + libbilin_scaler_spu.so libbilin_scaler_spu.a $(INSTALL) -c -m 0755 libfb_writer_spu.so $(PREFIX)/. $(INSTALL) -c -m 0655 libfb_writer_spu.a $(PREFIX)/. $(INSTALL) -c -m 0755 libyuv2rgb_spu.so $(PREFIX)/. $(INSTALL) -c -m 0655 libyuv2rgb_spu.a $(PREFIX)/. -# $(INSTALL) -c -m 0755 libbilin_scaler_spu.so $(PREFIX)/. -# $(INSTALL) -c -m 0655 libbilin_scaler_spu.a $(PREFIX)/. + $(INSTALL) -c -m 0755 libbilin_scaler_spu.so $(PREFIX)/. + $(INSTALL) -c -m 0655 libbilin_scaler_spu.a $(PREFIX)/. uninstall: $(PREFIX)/libfb_writer_spu.so $(PREFIX)/libfb_writer_spu.a \ - $(PREFIX)/libyuv2rgb_spu.so $(PREFIX)/libyuv2rgb_spu.a -# $(PREFIX)/libbilin_scaler_spu.so $(PREFIX)/libbilin_scaler_spu.a + $(PREFIX)/libyuv2rgb_spu.so $(PREFIX)/libyuv2rgb_spu.a \ + $(PREFIX)/libbilin_scaler_spu.so $(PREFIX)/libbilin_scaler_spu.a rm -f $(PREFIX)/libfb_writer_spu.a rm -f $(PREFIX)/libfb_writer_spu.so rm -f $(PREFIX)/libyuv2rgb_spu.so rm -f $(PREFIX)/libyuv2rgb_spu.a -# rm -f $(PREFIX)/libbilin_scaler_spu.so -# rm -f $(PREFIX)/libbilin_scaler_spu.a + rm -f $(PREFIX)/libbilin_scaler_spu.so + rm -f $(PREFIX)/libbilin_scaler_spu.a clean:
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/spulibs/bilin_scaler.c Fri Jun 19 05:22:00 2009 +0000 @@ -0,0 +1,2050 @@ +/* + * SDL - Simple DirectMedia Layer + * CELL BE Support for PS3 Framebuffer + * Copyright (C) 2008, 2009 International Business Machines Corporation + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + * + * Martin Lowinski <lowinski [at] de [dot] ibm [ibm] com> + * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com> + * SPE code based on research by: + * Rene Becker + * Thimo Emmerich + */ + +#include "spu_common.h" + +#include <spu_intrinsics.h> +#include <spu_mfcio.h> + +// Debugging +//#define DEBUG + +#ifdef DEBUG +#define deprintf(fmt, args... ) \ + fprintf( stdout, fmt, ##args ); \ + fflush( stdout ); +#else +#define deprintf( fmt, args... ) +#endif + +struct scale_parms_t parms __attribute__((aligned(128))); + +/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored + * there might be the need to retrieve misaligned data, adjust + * incoming v and u plane to be able to handle this (add 128) + */ +unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128))); +unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128))); +unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128))); + +/* temp-buffer for scaling: 4 lines Y, therefore 2 lines V, 2 lines U */ +unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128))); +unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128))); +unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128))); + +/* some vectors needed by the float to int conversion */ +static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f }; +static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f }; + +void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride); +void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride); + +void scale_srcw16_dstw16(); +void scale_srcw16_dstw32(); +void scale_srcw32_dstw16(); +void scale_srcw32_dstw32(); + +int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp ) +{ + deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id); + /* DMA transfer for the input parameters */ + spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD); + DMA_WAIT_TAG(TAG_INIT); + + deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height, + parms.dst_pixel_width, parms.dst_pixel_height); + + if(parms.src_pixel_width & 0x1f) { + if(parms.dst_pixel_width & 0x1F) { + deprintf("[SPU] Using scale_srcw16_dstw16\n"); + scale_srcw16_dstw16(); + } else { + deprintf("[SPU] Using scale_srcw16_dstw32\n"); + scale_srcw16_dstw32(); + } + } else { + if(parms.dst_pixel_width & 0x1F) { + deprintf("[SPU] Using scale_srcw32_dstw16\n"); + scale_srcw32_dstw16(); + } else { + deprintf("[SPU] Using scale_srcw32_dstw32\n"); + scale_srcw32_dstw32(); + } + } + deprintf("[SPU] bilin_scaler_spu... done!\n"); + + return 0; +} + + +/* + * vfloat_to_vuint() + * + * converts a float vector to an unsinged int vector using saturated + * arithmetic + * + * @param vec_s float vector for conversion + * @returns converted unsigned int vector + */ +inline static vector unsigned int vfloat_to_vuint(vector float vec_s) { + vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s); + vec_s = spu_sel(vec_s, vec_0_1, select_1); + + vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255); + vec_s = spu_sel(vec_s, vec_255, select_2); + return spu_convtu(vec_s,0); +} + + +/* + * scale_srcw16_dstw16() + * + * processes an input image of width 16 + * scaling is done to a width 16 + * result stored in RAM + */ +void scale_srcw16_dstw16() { + // extract parameters + unsigned char* dst_addr = (unsigned char *)parms.dstBuffer; + + unsigned int src_width = parms.src_pixel_width; + unsigned int src_height = parms.src_pixel_height; + unsigned int dst_width = parms.dst_pixel_width; + unsigned int dst_height = parms.dst_pixel_height; + + // YVU + unsigned int src_linestride_y = src_width; + unsigned int src_dbl_linestride_y = src_width<<1; + unsigned int src_linestride_vu = src_width>>1; + unsigned int src_dbl_linestride_vu = src_width; + + // scaled YVU + unsigned int scaled_src_linestride_y = dst_width; + + // ram addresses + unsigned char* src_addr_y = parms.y_plane; + unsigned char* src_addr_v = parms.v_plane; + unsigned char* src_addr_u = parms.u_plane; + + // for handling misalignment, addresses are precalculated + unsigned char* precalc_src_addr_v = src_addr_v; + unsigned char* precalc_src_addr_u = src_addr_u; + + unsigned int dst_picture_size = dst_width*dst_height; + + // Sizes for destination + unsigned int dst_dbl_linestride_y = dst_width<<1; + unsigned int dst_dbl_linestride_vu = dst_width>>1; + + // Perform address calculation for Y, V and U in main memory with dst_addr as base + unsigned char* dst_addr_main_memory_y = dst_addr; + unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size; + unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2); + + // calculate scale factors + vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width ); + float y_scale = (float)src_height/(float)dst_height; + + // double buffered processing + // buffer switching + unsigned int curr_src_idx = 0; + unsigned int curr_dst_idx = 0; + unsigned int next_src_idx, next_dst_idx; + + // 2 lines y as output, upper and lowerline + unsigned int curr_interpl_y_upper = 0; + unsigned int next_interpl_y_upper; + unsigned int curr_interpl_y_lower, next_interpl_y_lower; + // only 1 line v/u output, both planes have the same dimension + unsigned int curr_interpl_vu = 0; + unsigned int next_interpl_vu; + + // weights, calculated in every loop iteration + vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_y_upper; + vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower; + vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_vu; + + // line indices for the src picture + float curr_src_y_upper = 0.0f, next_src_y_upper; + float curr_src_y_lower, next_src_y_lower; + float curr_src_vu = 0.0f, next_src_vu; + + // line indices for the dst picture + unsigned int dst_y=0, dst_vu=0; + + // offset for the v and u plane to handle misalignement + unsigned int curr_lsoff_v = 0, next_lsoff_v; + unsigned int curr_lsoff_u = 0, next_lsoff_u; + + // calculate lower line indices + curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale; + curr_interpl_y_lower = (unsigned int)curr_src_y_lower; + // lower line weight + vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower ); + + + // start partially double buffered processing + // get initial data, 2 sets of y, 1 set v, 1 set u + mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 ); + mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF, + 0, 0 ); + mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + + /* iteration loop + * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved + * the scaled output is 2 lines y, 1 line v, 1 line u + * the yuv2rgb-converted output is stored to RAM + */ + for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) { + dst_y = dst_vu<<1; + + // calculate next indices + next_src_vu = ((float)dst_vu+1)*y_scale; + next_src_y_upper = ((float)dst_y+2)*y_scale; + next_src_y_lower = ((float)dst_y+3)*y_scale; + + next_interpl_vu = (unsigned int) next_src_vu; + next_interpl_y_upper = (unsigned int) next_src_y_upper; + next_interpl_y_lower = (unsigned int) next_src_y_lower; + + // calculate weight NORTH-SOUTH + vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu ); + vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper ); + vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower ); + + // get next lines + next_src_idx = curr_src_idx^1; + next_dst_idx = curr_dst_idx^1; + + // 4 lines y + mfc_get( y_plane[next_src_idx], + (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + + // 2 lines v + precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu); + next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F; + mfc_get( v_plane[next_src_idx], + ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0, + src_dbl_linestride_vu+(next_lsoff_v<<1), + RETR_BUF+next_src_idx, + 0, 0 ); + // 2 lines u + precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu); + next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F; + mfc_get( u_plane[next_src_idx], + ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0, + src_dbl_linestride_vu+(next_lsoff_v<<1), + RETR_BUF+next_src_idx, + 0, 0 ); + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v, + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u, + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + + // Store the result back to main memory into a destination buffer in YUV format + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + //--------------------------------------------------------------------------------------------- + + + // update for next cycle + curr_src_idx = next_src_idx; + curr_dst_idx = next_dst_idx; + + curr_interpl_y_upper = next_interpl_y_upper; + curr_interpl_y_lower = next_interpl_y_lower; + curr_interpl_vu = next_interpl_vu; + + vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper; + vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower; + vf_curr_NSweight_vu = vf_next_NSweight_vu; + + curr_src_y_upper = next_src_y_upper; + curr_src_y_lower = next_src_y_lower; + curr_src_vu = next_src_vu; + + curr_lsoff_v = next_lsoff_v; + curr_lsoff_u = next_lsoff_u; + } + + + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v, + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u, + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + + // Store the result back to main memory into a destination buffer in YUV format + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + // wait for completion + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + //--------------------------------------------------------------------------------------------- +} + + +/* + * scale_srcw16_dstw32() + * + * processes an input image of width 16 + * scaling is done to a width 32 + * yuv2rgb conversion on a width of 32 + * result stored in RAM + */ +void scale_srcw16_dstw32() { + // extract parameters + unsigned char* dst_addr = (unsigned char *)parms.dstBuffer; + + unsigned int src_width = parms.src_pixel_width; + unsigned int src_height = parms.src_pixel_height; + unsigned int dst_width = parms.dst_pixel_width; + unsigned int dst_height = parms.dst_pixel_height; + + // YVU + unsigned int src_linestride_y = src_width; + unsigned int src_dbl_linestride_y = src_width<<1; + unsigned int src_linestride_vu = src_width>>1; + unsigned int src_dbl_linestride_vu = src_width; + // scaled YVU + unsigned int scaled_src_linestride_y = dst_width; + + // ram addresses + unsigned char* src_addr_y = parms.y_plane; + unsigned char* src_addr_v = parms.v_plane; + unsigned char* src_addr_u = parms.u_plane; + + unsigned int dst_picture_size = dst_width*dst_height; + + // Sizes for destination + unsigned int dst_dbl_linestride_y = dst_width<<1; + unsigned int dst_dbl_linestride_vu = dst_width>>1; + + // Perform address calculation for Y, V and U in main memory with dst_addr as base + unsigned char* dst_addr_main_memory_y = dst_addr; + unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size; + unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2); + + + // for handling misalignment, addresses are precalculated + unsigned char* precalc_src_addr_v = src_addr_v; + unsigned char* precalc_src_addr_u = src_addr_u; + + // calculate scale factors + vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width ); + float y_scale = (float)src_height/(float)dst_height; + + // double buffered processing + // buffer switching + unsigned int curr_src_idx = 0; + unsigned int curr_dst_idx = 0; + unsigned int next_src_idx, next_dst_idx; + + // 2 lines y as output, upper and lowerline + unsigned int curr_interpl_y_upper = 0; + unsigned int next_interpl_y_upper; + unsigned int curr_interpl_y_lower, next_interpl_y_lower; + // only 1 line v/u output, both planes have the same dimension + unsigned int curr_interpl_vu = 0; + unsigned int next_interpl_vu; + + // weights, calculated in every loop iteration + vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_y_upper; + vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower; + vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_vu; + + // line indices for the src picture + float curr_src_y_upper = 0.0f, next_src_y_upper; + float curr_src_y_lower, next_src_y_lower; + float curr_src_vu = 0.0f, next_src_vu; + + // line indices for the dst picture + unsigned int dst_y=0, dst_vu=0; + + // offset for the v and u plane to handle misalignement + unsigned int curr_lsoff_v = 0, next_lsoff_v; + unsigned int curr_lsoff_u = 0, next_lsoff_u; + + // calculate lower line idices + curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale; + curr_interpl_y_lower = (unsigned int)curr_src_y_lower; + // lower line weight + vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower ); + + + // start partially double buffered processing + // get initial data, 2 sets of y, 1 set v, 1 set u + mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 ); + mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF, + 0, 0 ); + mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + + // iteration loop + // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved + // the scaled output is 2 lines y, 1 line v, 1 line u + // the yuv2rgb-converted output is stored to RAM + for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) { + dst_y = dst_vu<<1; + + // calculate next indices + next_src_vu = ((float)dst_vu+1)*y_scale; + next_src_y_upper = ((float)dst_y+2)*y_scale; + next_src_y_lower = ((float)dst_y+3)*y_scale; + + next_interpl_vu = (unsigned int) next_src_vu; + next_interpl_y_upper = (unsigned int) next_src_y_upper; + next_interpl_y_lower = (unsigned int) next_src_y_lower; + + // calculate weight NORTH-SOUTH + vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu ); + vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper ); + vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower ); + + // get next lines + next_src_idx = curr_src_idx^1; + next_dst_idx = curr_dst_idx^1; + + // 4 lines y + mfc_get( y_plane[next_src_idx], + (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + + // 2 lines v + precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu); + next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F; + mfc_get( v_plane[next_src_idx], + ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0, + src_dbl_linestride_vu+(next_lsoff_v<<1), + RETR_BUF+next_src_idx, + 0, 0 ); + // 2 lines u + precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu); + next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F; + mfc_get( u_plane[next_src_idx], + ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0, + src_dbl_linestride_vu+(next_lsoff_v<<1), + RETR_BUF+next_src_idx, + 0, 0 ); + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v, + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u, + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + //--------------------------------------------------------------------------------------------- + + + // update for next cycle + curr_src_idx = next_src_idx; + curr_dst_idx = next_dst_idx; + + curr_interpl_y_upper = next_interpl_y_upper; + curr_interpl_y_lower = next_interpl_y_lower; + curr_interpl_vu = next_interpl_vu; + + vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper; + vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower; + vf_curr_NSweight_vu = vf_next_NSweight_vu; + + curr_src_y_upper = next_src_y_upper; + curr_src_y_lower = next_src_y_lower; + curr_src_vu = next_src_vu; + + curr_lsoff_v = next_lsoff_v; + curr_lsoff_u = next_lsoff_u; + } + + + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v, + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u, + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + // wait for completion + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + //--------------------------------------------------------------------------------------------- +} + + +/* + * scale_srcw32_dstw16() + * + * processes an input image of width 32 + * scaling is done to a width 16 + * yuv2rgb conversion on a width of 16 + * result stored in RAM + */ +void scale_srcw32_dstw16() { + // extract parameters + unsigned char* dst_addr = (unsigned char *)parms.dstBuffer; + + unsigned int src_width = parms.src_pixel_width; + unsigned int src_height = parms.src_pixel_height; + unsigned int dst_width = parms.dst_pixel_width; + unsigned int dst_height = parms.dst_pixel_height; + + // YVU + unsigned int src_linestride_y = src_width; + unsigned int src_dbl_linestride_y = src_width<<1; + unsigned int src_linestride_vu = src_width>>1; + unsigned int src_dbl_linestride_vu = src_width; + // scaled YVU + unsigned int scaled_src_linestride_y = dst_width; + + // ram addresses + unsigned char* src_addr_y = parms.y_plane; + unsigned char* src_addr_v = parms.v_plane; + unsigned char* src_addr_u = parms.u_plane; + + unsigned int dst_picture_size = dst_width*dst_height; + + // Sizes for destination + unsigned int dst_dbl_linestride_y = dst_width<<1; + unsigned int dst_dbl_linestride_vu = dst_width>>1; + + // Perform address calculation for Y, V and U in main memory with dst_addr as base + unsigned char* dst_addr_main_memory_y = dst_addr; + unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size; + unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2); + + // calculate scale factors + vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width ); + float y_scale = (float)src_height/(float)dst_height; + + // double buffered processing + // buffer switching + unsigned int curr_src_idx = 0; + unsigned int curr_dst_idx = 0; + unsigned int next_src_idx, next_dst_idx; + + // 2 lines y as output, upper and lowerline + unsigned int curr_interpl_y_upper = 0; + unsigned int next_interpl_y_upper; + unsigned int curr_interpl_y_lower, next_interpl_y_lower; + // only 1 line v/u output, both planes have the same dimension + unsigned int curr_interpl_vu = 0; + unsigned int next_interpl_vu; + + // weights, calculated in every loop iteration + vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_y_upper; + vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower; + vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_vu; + + // line indices for the src picture + float curr_src_y_upper = 0.0f, next_src_y_upper; + float curr_src_y_lower, next_src_y_lower; + float curr_src_vu = 0.0f, next_src_vu; + + // line indices for the dst picture + unsigned int dst_y=0, dst_vu=0; + + // calculate lower line idices + curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale; + curr_interpl_y_lower = (unsigned int)curr_src_y_lower; + // lower line weight + vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower ); + + + // start partially double buffered processing + // get initial data, 2 sets of y, 1 set v, 1 set u + mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 ); + mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF, + 0, 0 ); + mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + + // iteration loop + // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved + // the scaled output is 2 lines y, 1 line v, 1 line u + // the yuv2rgb-converted output is stored to RAM + for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) { + dst_y = dst_vu<<1; + + // calculate next indices + next_src_vu = ((float)dst_vu+1)*y_scale; + next_src_y_upper = ((float)dst_y+2)*y_scale; + next_src_y_lower = ((float)dst_y+3)*y_scale; + + next_interpl_vu = (unsigned int) next_src_vu; + next_interpl_y_upper = (unsigned int) next_src_y_upper; + next_interpl_y_lower = (unsigned int) next_src_y_lower; + + // calculate weight NORTH-SOUTH + vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu ); + vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper ); + vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower ); + + // get next lines + next_src_idx = curr_src_idx^1; + next_dst_idx = curr_dst_idx^1; + + // 4 lines y + mfc_get( y_plane[next_src_idx], + (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + + // 2 lines v + mfc_get( v_plane[next_src_idx], + (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu), + src_dbl_linestride_vu, + RETR_BUF+next_src_idx, + 0, 0 ); + // 2 lines u + mfc_get( u_plane[next_src_idx], + (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu), + src_dbl_linestride_vu, + RETR_BUF+next_src_idx, + 0, 0 ); + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w16( v_plane[curr_src_idx], + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w16( u_plane[curr_src_idx], + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + //--------------------------------------------------------------------------------------------- + + + // update for next cycle + curr_src_idx = next_src_idx; + curr_dst_idx = next_dst_idx; + + curr_interpl_y_upper = next_interpl_y_upper; + curr_interpl_y_lower = next_interpl_y_lower; + curr_interpl_vu = next_interpl_vu; + + vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper; + vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower; + vf_curr_NSweight_vu = vf_next_NSweight_vu; + + curr_src_y_upper = next_src_y_upper; + curr_src_y_lower = next_src_y_lower; + curr_src_vu = next_src_vu; + } + + + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w16( v_plane[curr_src_idx], + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w16( u_plane[curr_src_idx], + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + // wait for completion + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + //--------------------------------------------------------------------------------------------- +} + + +/** + * scale_srcw32_dstw32() + * + * processes an input image of width 32 + * scaling is done to a width 32 + * yuv2rgb conversion on a width of 32 + * result stored in RAM + */ +void scale_srcw32_dstw32() { + // extract parameters + unsigned char* dst_addr = (unsigned char *)parms.dstBuffer; + + unsigned int src_width = parms.src_pixel_width; + unsigned int src_height = parms.src_pixel_height; + unsigned int dst_width = parms.dst_pixel_width; + unsigned int dst_height = parms.dst_pixel_height; + + // YVU + unsigned int src_linestride_y = src_width; + unsigned int src_dbl_linestride_y = src_width<<1; + unsigned int src_linestride_vu = src_width>>1; + unsigned int src_dbl_linestride_vu = src_width; + + // scaled YVU + unsigned int scaled_src_linestride_y = dst_width; + + // ram addresses + unsigned char* src_addr_y = parms.y_plane; + unsigned char* src_addr_v = parms.v_plane; + unsigned char* src_addr_u = parms.u_plane; + + unsigned int dst_picture_size = dst_width*dst_height; + + // Sizes for destination + unsigned int dst_dbl_linestride_y = dst_width<<1; + unsigned int dst_dbl_linestride_vu = dst_width>>1; + + // Perform address calculation for Y, V and U in main memory with dst_addr as base + unsigned char* dst_addr_main_memory_y = dst_addr; + unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size; + unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2); + + // calculate scale factors + vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width ); + float y_scale = (float)src_height/(float)dst_height; + + // double buffered processing + // buffer switching + unsigned int curr_src_idx = 0; + unsigned int curr_dst_idx = 0; + unsigned int next_src_idx, next_dst_idx; + + // 2 lines y as output, upper and lowerline + unsigned int curr_interpl_y_upper = 0; + unsigned int next_interpl_y_upper; + unsigned int curr_interpl_y_lower, next_interpl_y_lower; + // only 1 line v/u output, both planes have the same dimension + unsigned int curr_interpl_vu = 0; + unsigned int next_interpl_vu; + + // weights, calculated in every loop iteration + vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_y_upper; + vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower; + vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_vu; + + // line indices for the src picture + float curr_src_y_upper = 0.0f, next_src_y_upper; + float curr_src_y_lower, next_src_y_lower; + float curr_src_vu = 0.0f, next_src_vu; + + // line indices for the dst picture + unsigned int dst_y=0, dst_vu=0; + + // calculate lower line idices + curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale; + curr_interpl_y_lower = (unsigned int)curr_src_y_lower; + // lower line weight + vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower ); + + + // start partially double buffered processing + // get initial data, 2 sets of y, 1 set v, 1 set u + mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 ); + mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF, + 0, 0 ); + mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + + // iteration loop + // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved + // the scaled output is 2 lines y, 1 line v, 1 line u + // the yuv2rgb-converted output is stored to RAM + for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) { + dst_y = dst_vu<<1; + + // calculate next indices + next_src_vu = ((float)dst_vu+1)*y_scale; + next_src_y_upper = ((float)dst_y+2)*y_scale; + next_src_y_lower = ((float)dst_y+3)*y_scale; + + next_interpl_vu = (unsigned int) next_src_vu; + next_interpl_y_upper = (unsigned int) next_src_y_upper; + next_interpl_y_lower = (unsigned int) next_src_y_lower; + + // calculate weight NORTH-SOUTH + vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu ); + vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper ); + vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower ); + + // get next lines + next_src_idx = curr_src_idx^1; + next_dst_idx = curr_dst_idx^1; + + // 4 lines y + mfc_get( y_plane[next_src_idx], + (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + + // 2 lines v + mfc_get( v_plane[next_src_idx], + (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu), + src_dbl_linestride_vu, + RETR_BUF+next_src_idx, + 0, 0 ); + // 2 lines u + mfc_get( u_plane[next_src_idx], + (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu), + src_dbl_linestride_vu, + RETR_BUF+next_src_idx, + 0, 0 ); + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w16( v_plane[curr_src_idx], + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w16( u_plane[curr_src_idx], + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + + + // Store the result back to main memory into a destination buffer in YUV format + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + //--------------------------------------------------------------------------------------------- + + + // update for next cycle + curr_src_idx = next_src_idx; + curr_dst_idx = next_dst_idx; + + curr_interpl_y_upper = next_interpl_y_upper; + curr_interpl_y_lower = next_interpl_y_lower; + curr_interpl_vu = next_interpl_vu; + + vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper; + vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower; + vf_curr_NSweight_vu = vf_next_NSweight_vu; + + curr_src_y_upper = next_src_y_upper; + curr_src_y_lower = next_src_y_lower; + curr_src_vu = next_src_vu; + } + + + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w16( v_plane[curr_src_idx], + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w16( u_plane[curr_src_idx], + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + + // Store the result back to main memory into a destination buffer in YUV format + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + // wait for completion + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + //--------------------------------------------------------------------------------------------- +} + + +/* + * bilinear_scale_line_w8() + * + * processes a line of yuv-input, width has to be a multiple of 8 + * scaled yuv-output is written to local store buffer + * + * @param src buffer for 2 lines input + * @param dst_ buffer for 1 line output + * @param dst_width the width of the destination line + * @param vf_x_scale a float vector, at each entry is the x_scale-factor + * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line + * @param src_linestride the stride of the srcline + */ +void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) { + + unsigned char* dst = dst_; + + unsigned int dst_x; + for( dst_x=0; dst_x<dst_width; dst_x+=8) { + // address calculation for loading the 4 surrounding pixel of each calculated + // destination pixel + vector unsigned int vui_dst_x_tmp = spu_splats( dst_x ); + // lower range->first 4 pixel + // upper range->next 4 pixel + vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 }; + vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 }; + vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range ); + vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range ); + + // calculate weight EAST-WEST + vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 ); + vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 ); + vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale ); + vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale ); + vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 ); + vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 ); + vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 ); + vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 ); + vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range ); + vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range ); + + // calculate address offset + // + // pixel NORTH WEST + vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range; + vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range; + + // pixel NORTH EAST-->(offpixelNW+1) + vector unsigned int vui_add_1 = { 1, 1, 1, 1 }; + vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 ); + vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 ); + + // SOUTH-WEST-->(offpixelNW+src_linestride) + vector unsigned int vui_srclinestride = spu_splats( src_linestride ); + vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range ); + vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range ); + + // SOUTH-EAST-->(offpixelNW+src_linestride+1) + vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range ); + vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range ); + + // calculate each address + vector unsigned int vui_src_ls = spu_splats( (unsigned int) src ); + vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range ); + vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range ); + vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range ); + vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range ); + + vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range ); + vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range ); + vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range ); + vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range ); + + // get each pixel + // + // scalar load, afterwards insertion into the right position + // NORTH WEST + vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + vector unsigned char vuc_pixel_NW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 ); + vuc_pixel_NW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )), + vuc_pixel_NW_lower_range, 7 ); + vuc_pixel_NW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )), + vuc_pixel_NW_lower_range, 11 ); + vuc_pixel_NW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )), + vuc_pixel_NW_lower_range, 15 ); + + vector unsigned char vuc_pixel_NW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 ); + vuc_pixel_NW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )), + vuc_pixel_NW_upper_range, 7 ); + vuc_pixel_NW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )), + vuc_pixel_NW_upper_range, 11 ); + vuc_pixel_NW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )), + vuc_pixel_NW_upper_range, 15 ); + + // NORTH EAST + vector unsigned char vuc_pixel_NE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 ); + vuc_pixel_NE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )), + vuc_pixel_NE_lower_range, 7 ); + vuc_pixel_NE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )), + vuc_pixel_NE_lower_range, 11 ); + vuc_pixel_NE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )), + vuc_pixel_NE_lower_range, 15 ); + + vector unsigned char vuc_pixel_NE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 ); + vuc_pixel_NE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )), + vuc_pixel_NE_upper_range, 7 ); + vuc_pixel_NE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )), + vuc_pixel_NE_upper_range, 11 ); + vuc_pixel_NE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )), + vuc_pixel_NE_upper_range, 15 ); + + + // SOUTH WEST + vector unsigned char vuc_pixel_SW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 ); + vuc_pixel_SW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )), + vuc_pixel_SW_lower_range, 7 ); + vuc_pixel_SW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )), + vuc_pixel_SW_lower_range, 11 ); + vuc_pixel_SW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )), + vuc_pixel_SW_lower_range, 15 ); + + vector unsigned char vuc_pixel_SW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 ); + vuc_pixel_SW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )), + vuc_pixel_SW_upper_range, 7 ); + vuc_pixel_SW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )), + vuc_pixel_SW_upper_range, 11 ); + vuc_pixel_SW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )), + vuc_pixel_SW_upper_range, 15 ); + + // SOUTH EAST + vector unsigned char vuc_pixel_SE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 ); + vuc_pixel_SE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )), + vuc_pixel_SE_lower_range, 7 ); + vuc_pixel_SE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )), + vuc_pixel_SE_lower_range, 11 ); + vuc_pixel_SE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )), + vuc_pixel_SE_lower_range, 15 ); + + vector unsigned char vuc_pixel_SE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 ); + vuc_pixel_SE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )), + vuc_pixel_SE_upper_range, 7 ); + vuc_pixel_SE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )), + vuc_pixel_SE_upper_range, 11 ); + vuc_pixel_SE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )), + vuc_pixel_SE_upper_range, 15 ); + + + // convert to float + vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 ); + vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 ); + + vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 ); + vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 ); + + vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 ); + vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 ); + + vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 ); + vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 ); + + + + // first linear interpolation: EWtop + // EWtop = NW + EWweight*(NE-NW) + // + // lower range + vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range ); + vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range, + vf_EWtop_lower_range_tmp, + vf_pixel_NW_lower_range ); + + // upper range + vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range ); + vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range, + vf_EWtop_upper_range_tmp, + vf_pixel_NW_upper_range ); + + + + // second linear interpolation: EWbottom + // EWbottom = SW + EWweight*(SE-SW) + // + // lower range + vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range ); + vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range, + vf_EWbottom_lower_range_tmp, + vf_pixel_SW_lower_range ); + + // upper range + vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range ); + vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range, + vf_EWbottom_upper_range_tmp, + vf_pixel_SW_upper_range ); + + + + // third linear interpolation: the bilinear interpolated value + // result = EWtop + NSweight*(EWbottom-EWtop); + // + // lower range + vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range ); + vector float vf_result_lower_range = spu_madd( vf_NSweight, + vf_result_lower_range_tmp, + vf_EWtop_lower_range ); + + // upper range + vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range ); + vector float vf_result_upper_range = spu_madd( vf_NSweight, + vf_result_upper_range_tmp, + vf_EWtop_upper_range ); + + + // convert back: using saturated arithmetic + vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range ); + vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range ); + + // merge results->lower,upper + vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F, + 0x13, 0x17, 0x1B, 0x1F, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 }; + + vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range, + (vector unsigned char) vui_result_upper_range, + vuc_mask_merge_result ); + + // partial storing + vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF }; + + + // get currently stored data + vector unsigned char vuc_orig = *((vector unsigned char*)dst); + + // clear currently stored data + vuc_orig = spu_and( vuc_orig, + spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) ); + + // rotate result according to storing address + vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F ); + + // store result + *((vector unsigned char*)dst) = spu_or( vuc_result, + vuc_orig ); + dst += 8; + } +} + + +/* + * bilinear_scale_line_w16() + * + * processes a line of yuv-input, width has to be a multiple of 16 + * scaled yuv-output is written to local store buffer + * + * @param src buffer for 2 lines input + * @param dst_ buffer for 1 line output + * @param dst_width the width of the destination line + * @param vf_x_scale a float vector, at each entry is the x_scale-factor + * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line + * @param src_linestride the stride of the srcline + */ +void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) { + + unsigned char* dst = dst_; + + unsigned int dst_x; + for( dst_x=0; dst_x<dst_width; dst_x+=16) { + // address calculation for loading the 4 surrounding pixel of each calculated + // destination pixel + vector unsigned int vui_dst_x_tmp = spu_splats( dst_x ); + // parallelised processing + // first range->pixel 1 2 3 4 + // second range->pixel 5 6 7 8 + // third range->pixel 9 10 11 12 + // fourth range->pixel 13 14 15 16 + vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 }; + vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 }; + vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 }; + vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 }; + vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range ); + vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range ); + vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range ); + vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range ); + + // calculate weight EAST-WEST + vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 ); + vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 ); + vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 ); + vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 ); + vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale ); + vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale ); + vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale ); + vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale ); + vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 ); + vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 ); + vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 ); + vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 ); + vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 ); + vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 ); + vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 ); + vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 ); + vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range ); + vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range ); + vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range ); + vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range ); + + // calculate address offset + // + // pixel NORTH WEST + vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range; + vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range; + vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range; + vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range; + + // pixel NORTH EAST-->(offpixelNW+1) + vector unsigned int vui_add_1 = { 1, 1, 1, 1 }; + vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 ); + vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 ); + vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 ); + vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 ); + + // SOUTH-WEST-->(offpixelNW+src_linestride) + vector unsigned int vui_srclinestride = spu_splats( src_linestride ); + vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range ); + vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range ); + vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range ); + vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range ); + + // SOUTH-EAST-->(offpixelNW+src_linestride+1) + vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range ); + vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range ); + vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range ); + vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range ); + + // calculate each address + vector unsigned int vui_src_ls = spu_splats( (unsigned int) src ); + vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range ); + vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range ); + vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range ); + vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range ); + + vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range ); + vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range ); + vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range ); + vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range ); + + vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range ); + vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range ); + vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range ); + vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range ); + + vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range ); + vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range ); + vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range ); + vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range ); + + + // get each pixel + // + // scalar load, afterwards insertion into the right position + // NORTH WEST + // first range + vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + vector unsigned char vuc_pixel_NW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 ); + vuc_pixel_NW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )), + vuc_pixel_NW_first_range, 7 ); + vuc_pixel_NW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )), + vuc_pixel_NW_first_range, 11 ); + vuc_pixel_NW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )), + vuc_pixel_NW_first_range, 15 ); + // second range + vector unsigned char vuc_pixel_NW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 ); + vuc_pixel_NW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )), + vuc_pixel_NW_second_range, 7 ); + vuc_pixel_NW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )), + vuc_pixel_NW_second_range, 11 ); + vuc_pixel_NW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )), + vuc_pixel_NW_second_range, 15 ); + // third range + vector unsigned char vuc_pixel_NW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 ); + vuc_pixel_NW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )), + vuc_pixel_NW_third_range, 7 ); + vuc_pixel_NW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )), + vuc_pixel_NW_third_range, 11 ); + vuc_pixel_NW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )), + vuc_pixel_NW_third_range, 15 ); + // fourth range + vector unsigned char vuc_pixel_NW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 ); + vuc_pixel_NW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )), + vuc_pixel_NW_fourth_range, 7 ); + vuc_pixel_NW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )), + vuc_pixel_NW_fourth_range, 11 ); + vuc_pixel_NW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )), + vuc_pixel_NW_fourth_range, 15 ); + + // NORTH EAST + // first range + vector unsigned char vuc_pixel_NE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 ); + vuc_pixel_NE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )), + vuc_pixel_NE_first_range, 7 ); + vuc_pixel_NE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )), + vuc_pixel_NE_first_range, 11 ); + vuc_pixel_NE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )), + vuc_pixel_NE_first_range, 15 ); + // second range + vector unsigned char vuc_pixel_NE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 ); + vuc_pixel_NE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )), + vuc_pixel_NE_second_range, 7 ); + vuc_pixel_NE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )), + vuc_pixel_NE_second_range, 11 ); + vuc_pixel_NE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )), + vuc_pixel_NE_second_range, 15 ); + // third range + vector unsigned char vuc_pixel_NE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 ); + vuc_pixel_NE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )), + vuc_pixel_NE_third_range, 7 ); + vuc_pixel_NE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )), + vuc_pixel_NE_third_range, 11 ); + vuc_pixel_NE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )), + vuc_pixel_NE_third_range, 15 ); + // fourth range + vector unsigned char vuc_pixel_NE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 ); + vuc_pixel_NE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )), + vuc_pixel_NE_fourth_range, 7 ); + vuc_pixel_NE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )), + vuc_pixel_NE_fourth_range, 11 ); + vuc_pixel_NE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )), + vuc_pixel_NE_fourth_range, 15 ); + + // SOUTH WEST + // first range + vector unsigned char vuc_pixel_SW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 ); + vuc_pixel_SW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )), + vuc_pixel_SW_first_range, 7 ); + vuc_pixel_SW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )), + vuc_pixel_SW_first_range, 11 ); + vuc_pixel_SW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )), + vuc_pixel_SW_first_range, 15 ); + // second range + vector unsigned char vuc_pixel_SW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 ); + vuc_pixel_SW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )), + vuc_pixel_SW_second_range, 7 ); + vuc_pixel_SW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )), + vuc_pixel_SW_second_range, 11 ); + vuc_pixel_SW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )), + vuc_pixel_SW_second_range, 15 ); + // third range + vector unsigned char vuc_pixel_SW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 ); + vuc_pixel_SW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )), + vuc_pixel_SW_third_range, 7 ); + vuc_pixel_SW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )), + vuc_pixel_SW_third_range, 11 ); + vuc_pixel_SW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )), + vuc_pixel_SW_third_range, 15 ); + // fourth range + vector unsigned char vuc_pixel_SW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 ); + vuc_pixel_SW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )), + vuc_pixel_SW_fourth_range, 7 ); + vuc_pixel_SW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )), + vuc_pixel_SW_fourth_range, 11 ); + vuc_pixel_SW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )), + vuc_pixel_SW_fourth_range, 15 ); + + // NORTH EAST + // first range + vector unsigned char vuc_pixel_SE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 ); + vuc_pixel_SE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )), + vuc_pixel_SE_first_range, 7 ); + vuc_pixel_SE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )), + vuc_pixel_SE_first_range, 11 ); + vuc_pixel_SE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )), + vuc_pixel_SE_first_range, 15 ); + // second range + vector unsigned char vuc_pixel_SE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 ); + vuc_pixel_SE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )), + vuc_pixel_SE_second_range, 7 ); + vuc_pixel_SE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )), + vuc_pixel_SE_second_range, 11 ); + vuc_pixel_SE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )), + vuc_pixel_SE_second_range, 15 ); + // third range + vector unsigned char vuc_pixel_SE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 ); + vuc_pixel_SE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )), + vuc_pixel_SE_third_range, 7 ); + vuc_pixel_SE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )), + vuc_pixel_SE_third_range, 11 ); + vuc_pixel_SE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )), + vuc_pixel_SE_third_range, 15 ); + // fourth range + vector unsigned char vuc_pixel_SE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 ); + vuc_pixel_SE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )), + vuc_pixel_SE_fourth_range, 7 ); + vuc_pixel_SE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )), + vuc_pixel_SE_fourth_range, 11 ); + vuc_pixel_SE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )), + vuc_pixel_SE_fourth_range, 15 ); + + + + // convert to float + vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 ); + vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 ); + vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 ); + vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 ); + + vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 ); + vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 ); + vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 ); + vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 ); + + vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 ); + vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 ); + vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 ); + vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 ); + + vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 ); + vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 ); + vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 ); + vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 ); + + // first linear interpolation: EWtop + // EWtop = NW + EWweight*(NE-NW) + // + // first range + vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range ); + vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range, + vf_EWtop_first_range_tmp, + vf_pixel_NW_first_range ); + + // second range + vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range ); + vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range, + vf_EWtop_second_range_tmp, + vf_pixel_NW_second_range ); + + // third range + vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range ); + vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range, + vf_EWtop_third_range_tmp, + vf_pixel_NW_third_range ); + + // fourth range + vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range ); + vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range, + vf_EWtop_fourth_range_tmp, + vf_pixel_NW_fourth_range ); + + + + // second linear interpolation: EWbottom + // EWbottom = SW + EWweight*(SE-SW) + // + // first range + vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range ); + vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range, + vf_EWbottom_first_range_tmp, + vf_pixel_SW_first_range ); + + // second range + vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range ); + vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range, + vf_EWbottom_second_range_tmp, + vf_pixel_SW_second_range ); + // first range + vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range ); + vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range, + vf_EWbottom_third_range_tmp, + vf_pixel_SW_third_range ); + + // first range + vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range ); + vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range, + vf_EWbottom_fourth_range_tmp, + vf_pixel_SW_fourth_range ); + + + + // third linear interpolation: the bilinear interpolated value + // result = EWtop + NSweight*(EWbottom-EWtop); + // + // first range + vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range ); + vector float vf_result_first_range = spu_madd( vf_NSweight, + vf_result_first_range_tmp, + vf_EWtop_first_range ); + + // second range + vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range ); + vector float vf_result_second_range = spu_madd( vf_NSweight, + vf_result_second_range_tmp, + vf_EWtop_second_range ); + + // third range + vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range ); + vector float vf_result_third_range = spu_madd( vf_NSweight, + vf_result_third_range_tmp, + vf_EWtop_third_range ); + + // fourth range + vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range ); + vector float vf_result_fourth_range = spu_madd( vf_NSweight, + vf_result_fourth_range_tmp, + vf_EWtop_fourth_range ); + + + + // convert back: using saturated arithmetic + vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range ); + vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range ); + vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range ); + vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range ); + + // merge results->lower,upper + vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F, + 0x13, 0x17, 0x1B, 0x1F, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 }; + + vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x03, 0x07, 0x0B, 0x0F, + 0x13, 0x17, 0x1B, 0x1F }; + + vector unsigned char vuc_result_first_second = + spu_shuffle( (vector unsigned char) vui_result_first_range, + (vector unsigned char) vui_result_second_range, + vuc_mask_merge_result_first_second ); + + vector unsigned char vuc_result_third_fourth = + spu_shuffle( (vector unsigned char) vui_result_third_range, + (vector unsigned char) vui_result_fourth_range, + vuc_mask_merge_result_third_fourth ); + + // store result + *((vector unsigned char*)dst) = spu_or( vuc_result_first_second, + vuc_result_third_fourth ); + dst += 16; + } +} +