Mercurial > sdl-ios-xcode
changeset 3257:94fb40a4a9a7
Merged Martin's code changes from Google Summer of Code 2009
author | Sam Lantinga <slouken@libsdl.org> |
---|---|
date | Mon, 07 Sep 2009 04:51:29 +0000 |
parents | 83c87f2b2aab |
children | e786366ea23b |
files | Makefile.in README.PS3 configure.in include/SDL_config.h.in src/video/SDL_sysvideo.h src/video/SDL_video.c src/video/SDL_yuv_sw.c src/video/SDL_yuv_sw_c.h src/video/ps3/SDL_ps3events.c src/video/ps3/SDL_ps3events_c.h src/video/ps3/SDL_ps3modes.c src/video/ps3/SDL_ps3modes_c.h src/video/ps3/SDL_ps3render.c src/video/ps3/SDL_ps3render_c.h src/video/ps3/SDL_ps3spe.c src/video/ps3/SDL_ps3spe_c.h src/video/ps3/SDL_ps3video.c src/video/ps3/SDL_ps3video.h src/video/ps3/spulibs/Makefile src/video/ps3/spulibs/bilin_scaler.c src/video/ps3/spulibs/fb_writer.c src/video/ps3/spulibs/spu_common.h src/video/ps3/spulibs/yuv2rgb.c |
diffstat | 23 files changed, 4744 insertions(+), 26 deletions(-) [+] |
line wrap: on
line diff
--- a/Makefile.in Sun Sep 06 15:04:38 2009 +0000 +++ b/Makefile.in Mon Sep 07 04:51:29 2009 +0000 @@ -40,6 +40,11 @@ SDLMAIN_SOURCES = @SDLMAIN_SOURCES@ SDLMAIN_OBJECTS = @SDLMAIN_OBJECTS@ +# PS3 SPU programs +SPU_GCC = @SPU_GCC@ +EMBEDSPU = @EMBEDSPU@ +include $(srcdir)/src/video/ps3/spulibs/Makefile + DIST = acinclude.m4 autogen.sh Borland.html Borland.zip BUGS build-scripts configure configure.in COPYING CREDITS docs docs.html include INSTALL Makefile.dc Makefile.minimal Makefile.in README* sdl-config.in sdl.m4 sdl.pc.in SDL.qpg.in SDL.spec SDL.spec.in src test TODO VisualC.html VisualC VisualCE Watcom-OS2.zip Watcom-Win32.zip WhatsNew Xcode HDRS = SDL.h SDL_atomic.h SDL_audio.h SDL_cdrom.h SDL_compat.h SDL_cpuinfo.h SDL_endian.h SDL_error.h SDL_events.h SDL_haptic.h SDL_joystick.h SDL_keyboard.h SDL_keysym.h SDL_loadso.h SDL_main.h SDL_mouse.h SDL_mutex.h SDL_name.h SDL_opengl.h SDL_opengles.h SDL_pixels.h SDL_platform.h SDL_power.h SDL_quit.h SDL_rect.h SDL_revision.h SDL_rwops.h SDL_scancode.h SDL_stdinc.h SDL_surface.h SDL_syswm.h SDL_thread.h SDL_timer.h SDL_types.h SDL_version.h SDL_video.h begin_code.h close_code.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.PS3 Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,35 @@ + +SDL on Sony Playstation3 +------------------------ + +Installation: + First, you have to install the Cell SDK + - Download the Cell SDK installer RPM and ISO images to + a temporary directory such as /tmp/cellsdk. + - Mount the image: mount -o loop CellSDK-Devel-Fedora_3.1.0.0.0.iso /tmp/cellsdk + - Install the SDK installer: rpm -ivh cell-install-3.1.0-0.0.noarch.rpm + - Install the SDK: cd /opt/cell && ./cellsdk --iso /tmp/cellsdkiso install + + You'll than need to install the SPU-libs + - Run make ps3-libs && make ps3libs-install + + Finally, install SDL + - Go to SDL-1.2/ and build SDL like any other GNU style package. + e.g. + - Build the configure-script with ./autogen.sh + - Configure SDL for your needs: ./configure --enable-video-ps3 ... + - Build and install it: make && make install + + +Todo: + - Mouse & Keyboard support + - On SPU-side the current scaler and converter restrictions are: + - resolution has to be a multiple of 8 (will work on that) + - scaler/converter only supports the YV12 and IYUV format + - the scaler works only bilinear (lanzos would be nice) + - Optimize the SPU-program handling on the PPE side + - Integrate spumedia in SDL + +Have fun! + Dirk Herrendoerfer <d.herrendoerfer [at] de [dot ibm [dot] com> +
--- a/configure.in Sun Sep 06 15:04:38 2009 +0000 +++ b/configure.in Mon Sep 07 04:51:29 2009 +0000 @@ -1509,6 +1509,46 @@ fi } +dnl See if we're running on PlayStation 3 Cell hardware +CheckPS3() +{ + AC_ARG_ENABLE(video-ps3, + AC_HELP_STRING([--enable-video-ps3], [use PlayStation 3 Cell driver [[default=yes]]]), + , enable_video_ps3=yes) + if test x$enable_video = xyes -a x$enable_video_ps3 = xyes; then + video_ps3=no + AC_CHECK_HEADER([linux/fb.h]) + AC_CHECK_HEADER([asm/ps3fb.h], [have_ps3fb_hdr=yes], [], + [#ifndef _LINUX_TYPES_H + #include <linux/types.h> + #endif]) + AC_CHECK_HEADER([libspe2.h], have_libspe2_hdr=yes) + AC_CHECK_LIB([spe2], spe_context_create, have_spe2_lib=yes) + + AC_CHECK_PROGS(SPU_GCC, [spu-gcc]) + AC_CHECK_PROGS(EMBEDSPU, [embedspu]) + + have_spu_libs=yes + AC_CHECK_LIB([fb_writer_spu], [main], [], [have_spu_libs=no]) + AC_CHECK_LIB([yuv2rgb_spu], [main], [], [have_spu_libs=no]) + AC_CHECK_LIB([bilin_scaler_spu], [main], [], [have_spu_libs=no]) + if test x$have_ps3fb_hdr = xyes -a x$have_libspe2_hdr = xyes -a x$have_spe2_lib = xyes -a "$SPU_GCC" -a "$EMBEDSPU"; then + AC_DEFINE(SDL_VIDEO_DRIVER_PS3) + video_ps3=yes + have_video=yes + SOURCES="$SOURCES $srcdir/src/video/ps3/*.c" + EXTRA_CFLAGS="$EXTRA_CFLAGS -I/opt/cell/sdk/usr/include" + EXTRA_LDFLAGS="$EXTRA_LDFLAGS -L/opt/cell/sdk/usr/lib -lspe2 -lfb_writer_spu -lyuv2rgb_spu -lbilin_scaler_spu" + + if test x$have_spu_libs = xno; then + AC_MSG_WARN([ps3libs missing, please run make ps3libs]) + fi + fi + AC_MSG_CHECKING([for PlayStation 3 Cell support]) + AC_MSG_RESULT([$video_ps3]) + fi +} + dnl Find the SVGAlib includes and libraries CheckSVGA() { @@ -2401,6 +2441,7 @@ CheckDirectFB CheckFusionSound CheckPS2GS + CheckPS3 CheckSVGA CheckVGL CheckWscons
--- a/include/SDL_config.h.in Sun Sep 06 15:04:38 2009 +0000 +++ b/include/SDL_config.h.in Mon Sep 07 04:51:29 2009 +0000 @@ -273,6 +273,7 @@ #undef SDL_VIDEO_DRIVER_PHOTON #undef SDL_VIDEO_DRIVER_QNXGF #undef SDL_VIDEO_DRIVER_PS2GS +#undef SDL_VIDEO_DRIVER_PS3 #undef SDL_VIDEO_DRIVER_RISCOS #undef SDL_VIDEO_DRIVER_SVGALIB #undef SDL_VIDEO_DRIVER_VGL
--- a/src/video/SDL_sysvideo.h Sun Sep 06 15:04:38 2009 +0000 +++ b/src/video/SDL_sysvideo.h Mon Sep 07 04:51:29 2009 +0000 @@ -359,6 +359,9 @@ #if SDL_VIDEO_DRIVER_PS2GS extern VideoBootStrap PS2GS_bootstrap; #endif +#if SDL_VIDEO_DRIVER_PS3 +extern VideoBootStrap PS3_bootstrap; +#endif #if SDL_VIDEO_DRIVER_VGL extern VideoBootStrap VGL_bootstrap; #endif
--- a/src/video/SDL_video.c Sun Sep 06 15:04:38 2009 +0000 +++ b/src/video/SDL_video.c Mon Sep 07 04:51:29 2009 +0000 @@ -73,6 +73,9 @@ #if SDL_VIDEO_DRIVER_PS2GS &PS2GS_bootstrap, #endif +#if SDL_VIDEO_DRIVER_PS3 + &PS3_bootstrap, +#endif #if SDL_VIDEO_DRIVER_VGL &VGL_bootstrap, #endif
--- a/src/video/SDL_yuv_sw.c Sun Sep 06 15:04:38 2009 +0000 +++ b/src/video/SDL_yuv_sw.c Mon Sep 07 04:51:29 2009 +0000 @@ -88,32 +88,6 @@ #include "SDL_yuv_sw_c.h" -struct SDL_SW_YUVTexture -{ - Uint32 format; - Uint32 target_format; - int w, h; - Uint8 *pixels; - int *colortab; - Uint32 *rgb_2_pix; - void (*Display1X) (int *colortab, Uint32 * rgb_2_pix, - unsigned char *lum, unsigned char *cr, - unsigned char *cb, unsigned char *out, - int rows, int cols, int mod); - void (*Display2X) (int *colortab, Uint32 * rgb_2_pix, - unsigned char *lum, unsigned char *cr, - unsigned char *cb, unsigned char *out, - int rows, int cols, int mod); - - /* These are just so we don't have to allocate them separately */ - Uint16 pitches[3]; - Uint8 *planes[3]; - - /* This is a temporary surface in case we have to stretch copy */ - SDL_Surface *stretch; - SDL_Surface *display; -}; - /* The colorspace conversion functions */ #if (__GNUC__ > 2) && defined(__i386__) && __OPTIMIZE__ && SDL_ASSEMBLY_ROUTINES
--- a/src/video/SDL_yuv_sw_c.h Sun Sep 06 15:04:38 2009 +0000 +++ b/src/video/SDL_yuv_sw_c.h Mon Sep 07 04:51:29 2009 +0000 @@ -26,6 +26,32 @@ /* This is the software implementation of the YUV texture support */ +struct SDL_SW_YUVTexture +{ + Uint32 format; + Uint32 target_format; + int w, h; + Uint8 *pixels; + int *colortab; + Uint32 *rgb_2_pix; + void (*Display1X) (int *colortab, Uint32 * rgb_2_pix, + unsigned char *lum, unsigned char *cr, + unsigned char *cb, unsigned char *out, + int rows, int cols, int mod); + void (*Display2X) (int *colortab, Uint32 * rgb_2_pix, + unsigned char *lum, unsigned char *cr, + unsigned char *cb, unsigned char *out, + int rows, int cols, int mod); + + /* These are just so we don't have to allocate them separately */ + Uint16 pitches[3]; + Uint8 *planes[3]; + + /* This is a temporary surface in case we have to stretch copy */ + SDL_Surface *stretch; + SDL_Surface *display; +}; + typedef struct SDL_SW_YUVTexture SDL_SW_YUVTexture; SDL_SW_YUVTexture *SDL_SW_CreateYUVTexture(Uint32 format, int w, int h);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/SDL_ps3events.c Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,36 @@ +/* + SDL - Simple DirectMedia Layer + Copyright (C) 1997-2009 Sam Lantinga + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + Sam Lantinga + slouken@libsdl.org +*/ +#include "SDL_config.h" + +#include "../../events/SDL_sysevents.h" +#include "../../events/SDL_events_c.h" + +#include "SDL_ps3video.h" +#include "SDL_ps3events_c.h" + +void +PS3_PumpEvents(_THIS) +{ + /* do nothing. */ +} + +/* vi: set ts=4 sw=4 expandtab: */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/SDL_ps3events_c.h Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,28 @@ +/* + SDL - Simple DirectMedia Layer + Copyright (C) 1997-2009 Sam Lantinga + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + Sam Lantinga + slouken@libsdl.org +*/ +#include "SDL_config.h" + +#include "SDL_ps3video.h" + +extern void PS3_PumpEvents(_THIS); + +/* vi: set ts=4 sw=4 expandtab: */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/SDL_ps3modes.c Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,141 @@ +/* + SDL - Simple DirectMedia Layer + Copyright (C) 1997-2009 Sam Lantinga + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + Sam Lantinga + slouken@libsdl.org +*/ +#include "SDL_config.h" + +#include "SDL_ps3video.h" + +void +PS3_InitModes(_THIS) +{ + deprintf(1, "+PS3_InitModes()\n"); + SDL_VideoDisplay display; + SDL_VideoData *data = (SDL_VideoData *) _this->driverdata; + SDL_DisplayMode mode; + PS3_DisplayModeData *modedata; + unsigned long vid = 0; + + modedata = (PS3_DisplayModeData *) SDL_malloc(sizeof(*modedata)); + if (!modedata) { + return; + } + + /* Setting up the DisplayMode based on current settings */ + struct ps3fb_ioctl_res res; + if (ioctl(data->fbdev, PS3FB_IOCTL_SCREENINFO, &res)) { + SDL_SetError("Can't get PS3FB_IOCTL_SCREENINFO"); + } + mode.format = SDL_PIXELFORMAT_RGB888; + mode.refresh_rate = 0; + mode.w = res.xres; + mode.h = res.yres; + + /* Setting up driver specific mode data, + * Get the current ps3 specific videmode number */ + if (ioctl(data->fbdev, PS3FB_IOCTL_GETMODE, (unsigned long)&vid)) { + SDL_SetError("Can't get PS3FB_IOCTL_GETMODE"); + } + deprintf(2, "PS3FB_IOCTL_GETMODE = %u\n", vid); + modedata->mode = vid; + mode.driverdata = modedata; + + /* Set display's videomode and add it */ + SDL_zero(display); + display.desktop_mode = mode; + display.current_mode = mode; + + SDL_AddVideoDisplay(&display); + deprintf(1, "-PS3_InitModes()\n"); +} + +/* DisplayModes available on the PS3 */ +static SDL_DisplayMode ps3fb_modedb[] = { + /* VESA */ + {SDL_PIXELFORMAT_RGB888, 1280, 768, 0, NULL}, // WXGA + {SDL_PIXELFORMAT_RGB888, 1280, 1024, 0, NULL}, // SXGA + {SDL_PIXELFORMAT_RGB888, 1920, 1200, 0, NULL}, // WUXGA + /* Native resolutions (progressive, "fullscreen") */ + {SDL_PIXELFORMAT_RGB888, 720, 480, 0, NULL}, // 480p + {SDL_PIXELFORMAT_RGB888, 1280, 720, 0, NULL}, // 720p + {SDL_PIXELFORMAT_RGB888, 1920, 1080, 0, NULL} // 1080p +}; + +/* PS3 videomode number according to ps3fb_modedb */ +static PS3_DisplayModeData ps3fb_data[] = { + {11}, {12}, {13}, {130}, {131}, {133}, +}; + +void +PS3_GetDisplayModes(_THIS) { + deprintf(1, "+PS3_GetDisplayModes()\n"); + SDL_DisplayMode mode; + unsigned int nummodes; + + nummodes = sizeof(ps3fb_modedb) / sizeof(SDL_DisplayMode); + + int n; + for (n=0; n<nummodes; ++n) { + /* Get driver specific mode data */ + ps3fb_modedb[n].driverdata = &ps3fb_data[n]; + + /* Add DisplayMode to list */ + deprintf(2, "Adding resolution %u x %u\n", ps3fb_modedb[n].w, ps3fb_modedb[n].h); + SDL_AddDisplayMode(_this->current_display, &ps3fb_modedb[n]); + } + deprintf(1, "-PS3_GetDisplayModes()\n"); +} + +int +PS3_SetDisplayMode(_THIS, SDL_DisplayMode * mode) +{ + deprintf(1, "+PS3_SetDisplayMode()\n"); + SDL_VideoData *data = (SDL_VideoData *) _this->driverdata; + PS3_DisplayModeData *dispdata = (PS3_DisplayModeData *) mode->driverdata; + + /* Set the new DisplayMode */ + deprintf(2, "Setting PS3FB_MODE to %u\n", dispdata->mode); + if (ioctl(data->fbdev, PS3FB_IOCTL_SETMODE, (unsigned long)&dispdata->mode)) { + deprintf(2, "Could not set PS3FB_MODE\n"); + SDL_SetError("Could not set PS3FB_MODE\n"); + return -1; + } + + deprintf(1, "-PS3_SetDisplayMode()\n"); + return 0; +} + +void +PS3_QuitModes(_THIS) { + deprintf(1, "+PS3_QuitModes()\n"); + + /* There was no mem allocated for driverdata */ + int i, j; + for (i = _this->num_displays; i--;) { + SDL_VideoDisplay *display = &_this->displays[i]; + for (j = display->num_display_modes; j--;) { + display->display_modes[j].driverdata = NULL; + } + } + + deprintf(1, "-PS3_QuitModes()\n"); +} + +/* vi: set ts=4 sw=4 expandtab: */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/SDL_ps3modes_c.h Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,34 @@ +/* + SDL - Simple DirectMedia Layer + Copyright (C) 1997-2009 Sam Lantinga + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + Sam Lantinga + slouken@libsdl.org +*/ +#include "SDL_config.h" + +#ifndef _SDL_ps3modes_h +#define _SDL_ps3modes_h + +extern void PS3_InitModes(_THIS); +extern void PS3_GetDisplayModes(_THIS); +extern int PS3_SetDisplayMode(_THIS, SDL_DisplayMode * mode); +extern void PS3_QuitModes(_THIS); + +#endif /* SDL_ps3modes_h */ + +/* vi: set ts=4 sw=4 expandtab: */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/SDL_ps3render.c Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,746 @@ +/* + SDL - Simple DirectMedia Layer + Copyright (C) 1997-2009 Sam Lantinga + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + Sam Lantinga + slouken@libsdl.org +*/ +#include "SDL_config.h" + +#include "SDL_video.h" +#include "../SDL_sysvideo.h" +#include "../SDL_yuv_sw_c.h" +#include "../SDL_renderer_sw.h" + +#include "SDL_ps3video.h" +#include "SDL_ps3spe_c.h" + +#include <fcntl.h> +#include <stdlib.h> +#include <sys/ioctl.h> +#include <linux/kd.h> +#include <linux/fb.h> +#include <sys/mman.h> +#include <asm/ps3fb.h> + + +/* Stores the executable name */ +extern spe_program_handle_t yuv2rgb_spu; +extern spe_program_handle_t bilin_scaler_spu; + +/* SDL surface based renderer implementation */ +static SDL_Renderer *SDL_PS3_CreateRenderer(SDL_Window * window, + Uint32 flags); +static int SDL_PS3_DisplayModeChanged(SDL_Renderer * renderer); +static int SDL_PS3_ActivateRenderer(SDL_Renderer * renderer); +static int SDL_PS3_RenderPoint(SDL_Renderer * renderer, int x, int y); +static int SDL_PS3_RenderLine(SDL_Renderer * renderer, int x1, int y1, + int x2, int y2); +static int SDL_PS3_RenderFill(SDL_Renderer * renderer, + const SDL_Rect * rect); +static int SDL_PS3_RenderCopy(SDL_Renderer * renderer, + SDL_Texture * texture, + const SDL_Rect * srcrect, + const SDL_Rect * dstrect); +static void SDL_PS3_RenderPresent(SDL_Renderer * renderer); +static void SDL_PS3_DestroyRenderer(SDL_Renderer * renderer); + +/* Texture */ +static int PS3_CreateTexture(SDL_Renderer * renderer, SDL_Texture * texture); +static int PS3_QueryTexturePixels(SDL_Renderer * renderer, SDL_Texture * texture, void **pixels, int *pitch); +static int PS3_UpdateTexture(SDL_Renderer * renderer, SDL_Texture * texture, const SDL_Rect * rect, const void *pixels, int pitch); +static int PS3_LockTexture(SDL_Renderer * renderer, SDL_Texture * texture, const SDL_Rect * rect, int markDirty, void **pixels, int *pitch); +static void PS3_UnlockTexture(SDL_Renderer * renderer, SDL_Texture * texture); +static void PS3_DestroyTexture(SDL_Renderer * renderer, SDL_Texture * texture); + + +SDL_RenderDriver SDL_PS3_RenderDriver = { + SDL_PS3_CreateRenderer, + { + "ps3", + (SDL_RENDERER_SINGLEBUFFER | SDL_RENDERER_PRESENTVSYNC | + SDL_RENDERER_PRESENTFLIP2 | SDL_RENDERER_PRESENTDISCARD | + SDL_RENDERER_ACCELERATED), + (SDL_TEXTUREMODULATE_NONE), + (SDL_BLENDMODE_NONE), + /* We use bilinear scaling on the SPE for YV12 & IYUV + * (width and height % 8 = 0) */ + (SDL_TEXTURESCALEMODE_SLOW) + } +}; + +typedef struct +{ + int current_screen; + SDL_Surface *screen; + SDL_VideoDisplay *display; + /* adress of the centered image in the framebuffer (double buffered) */ + uint8_t *center[2]; + + /* width of input (bounded by writeable width) */ + unsigned int bounded_width; + /* height of input (bounded by writeable height) */ + unsigned int bounded_height; + /* offset from the left side (used for centering) */ + unsigned int offset_left; + /* offset from the upper side (used for centering) */ + unsigned int offset_top; + /* width of screen which is writeable */ + unsigned int wr_width; + /* width of screen which is writeable */ + unsigned int wr_height; + /* size of a screen line: width * bpp/8 */ + unsigned int line_length; + + /* Is the kernels fb size bigger than ~12MB + * double buffering will work for 1080p */ + unsigned int double_buffering; + + /* SPE threading stuff */ + spu_data_t *converter_thread_data; + spu_data_t *scaler_thread_data; + + /* YUV converting transfer data */ + volatile struct yuv2rgb_parms_t * converter_parms __attribute__((aligned(128))); + /* Scaler transfer data */ + volatile struct scale_parms_t * scaler_parms __attribute__((aligned(128))); +} SDL_PS3_RenderData; + +typedef struct +{ + int pitch; + /* Image data */ + volatile void *pixels; + /* Use software renderer for not supported formats */ + SDL_SW_YUVTexture *yuv; +} PS3_TextureData; + +SDL_Renderer * +SDL_PS3_CreateRenderer(SDL_Window * window, Uint32 flags) +{ + deprintf(1, "+SDL_PS3_CreateRenderer()\n"); + SDL_VideoDisplay *display = SDL_GetDisplayFromWindow(window); + SDL_DisplayMode *displayMode = &display->current_mode; + SDL_VideoData *devdata = display->device->driverdata; + SDL_Renderer *renderer; + SDL_PS3_RenderData *data; + struct ps3fb_ioctl_res res; + int i, n; + int bpp; + Uint32 Rmask, Gmask, Bmask, Amask; + + if (!SDL_PixelFormatEnumToMasks + (displayMode->format, &bpp, &Rmask, &Gmask, &Bmask, &Amask)) { + SDL_SetError("Unknown display format"); + return NULL; + } + + renderer = (SDL_Renderer *) SDL_calloc(1, sizeof(*renderer)); + if (!renderer) { + SDL_OutOfMemory(); + return NULL; + } + + data = (SDL_PS3_RenderData *) SDL_malloc(sizeof(*data)); + if (!data) { + SDL_PS3_DestroyRenderer(renderer); + SDL_OutOfMemory(); + return NULL; + } + SDL_zerop(data); + + renderer->CreateTexture = PS3_CreateTexture; + renderer->DestroyTexture = PS3_DestroyTexture; + renderer->QueryTexturePixels = PS3_QueryTexturePixels; + renderer->UpdateTexture = PS3_UpdateTexture; + renderer->LockTexture = PS3_LockTexture; + renderer->UnlockTexture = PS3_UnlockTexture; + renderer->ActivateRenderer = SDL_PS3_ActivateRenderer; + renderer->DisplayModeChanged = SDL_PS3_DisplayModeChanged; + renderer->RenderPoint = SDL_PS3_RenderPoint; + renderer->RenderLine = SDL_PS3_RenderLine; + renderer->RenderFill = SDL_PS3_RenderFill; + renderer->RenderCopy = SDL_PS3_RenderCopy; + renderer->RenderPresent = SDL_PS3_RenderPresent; + renderer->DestroyRenderer = SDL_PS3_DestroyRenderer; + renderer->info.name = SDL_PS3_RenderDriver.info.name; + renderer->info.flags = 0; + renderer->window = window->id; + renderer->driverdata = data; + + deprintf(1, "window->w = %u\n", window->w); + deprintf(1, "window->h = %u\n", window->h); + + data->double_buffering = 0; + + /* Get ps3 screeninfo */ + if (ioctl(devdata->fbdev, PS3FB_IOCTL_SCREENINFO, (unsigned long)&res) < 0) { + SDL_SetError("[PS3] PS3FB_IOCTL_SCREENINFO failed"); + } + deprintf(2, "res.num_frames = %d\n", res.num_frames); + + /* Only use double buffering if enough fb memory is available */ + if (res.num_frames > 1) { + renderer->info.flags |= SDL_RENDERER_PRESENTFLIP2; + n = 2; + data->double_buffering = 1; + } else { + renderer->info.flags |= SDL_RENDERER_PRESENTCOPY; + n = 1; + } + + data->screen = + SDL_CreateRGBSurface(0, window->w, window->h, bpp, Rmask, Gmask, + Bmask, Amask); + if (!data->screen) { + SDL_PS3_DestroyRenderer(renderer); + return NULL; + } + /* Allocate aligned memory for pixels */ + SDL_free(data->screen->pixels); + data->screen->pixels = (void *)memalign(16, data->screen->h * data->screen->pitch); + if (!data->screen->pixels) { + SDL_FreeSurface(data->screen); + SDL_OutOfMemory(); + return NULL; + } + SDL_memset(data->screen->pixels, 0, data->screen->h * data->screen->pitch); + SDL_SetSurfacePalette(data->screen, display->palette); + + data->current_screen = 0; + + /* Create SPU parms structure */ + data->converter_parms = (struct yuv2rgb_parms_t *) memalign(16, sizeof(struct yuv2rgb_parms_t)); + data->scaler_parms = (struct scale_parms_t *) memalign(16, sizeof(struct scale_parms_t)); + if (data->converter_parms == NULL || data->scaler_parms == NULL) { + SDL_PS3_DestroyRenderer(renderer); + SDL_OutOfMemory(); + return NULL; + } + + /* Set up the SPE threading data */ + data->converter_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t)); + data->scaler_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t)); + if (data->converter_thread_data == NULL || data->scaler_thread_data == NULL) { + SDL_PS3_DestroyRenderer(renderer); + SDL_OutOfMemory(); + return NULL; + } + + /* Set up the SPE scaler (booted) */ + data->scaler_thread_data->program = bilin_scaler_spu; + data->scaler_thread_data->program_name = "bilin_scaler_spu"; + data->scaler_thread_data->keepalive = 0; + data->scaler_thread_data->booted = 0; + + /* Set up the SPE converter (always running) */ + data->converter_thread_data->program = yuv2rgb_spu; + data->converter_thread_data->program_name = "yuv2rgb_spu"; + data->converter_thread_data->keepalive = 1; + data->converter_thread_data->booted = 0; + + SPE_Start(data->converter_thread_data); + + deprintf(1, "-SDL_PS3_CreateRenderer()\n"); + return renderer; +} + +static int +SDL_PS3_ActivateRenderer(SDL_Renderer * renderer) +{ + deprintf(1, "+PS3_ActivateRenderer()\n"); + SDL_PS3_RenderData *data = (SDL_PS3_RenderData *) renderer->driverdata; + + deprintf(1, "-PS3_ActivateRenderer()\n"); + return 0; +} + +static int SDL_PS3_DisplayModeChanged(SDL_Renderer * renderer) { + deprintf(1, "+PS3_DisplayModeChanged()\n"); + SDL_PS3_RenderData *data = (SDL_PS3_RenderData *) renderer->driverdata; + + deprintf(1, "-PS3_DisplayModeChanged()\n"); + return 0; +} + +static int +PS3_CreateTexture(SDL_Renderer * renderer, SDL_Texture * texture) { + deprintf(1, "+PS3_CreateTexture()\n"); + PS3_TextureData *data; + data = (PS3_TextureData *) SDL_calloc(1, sizeof(*data)); + if (!data) { + SDL_OutOfMemory(); + return -1; + } + data->pitch = (texture->w * SDL_BYTESPERPIXEL(texture->format)); + + if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) { + /* Use SDLs SW_YUVTexture */ + data->yuv = + SDL_SW_CreateYUVTexture(texture->format, texture->w, texture->h); + if (!data->yuv) { + SDL_OutOfMemory(); + return -1; + } + /* but align pixels */ + SDL_free(data->yuv->pixels); + data->yuv->pixels = (Uint8 *)memalign(16, texture->w * texture->h * 2); + if (!data->yuv->pixels) { + SDL_OutOfMemory(); + return -1; + } + + /* Redo: Find the pitch and offset values for the overlay */ + SDL_SW_YUVTexture *swdata = (SDL_SW_YUVTexture *) data->yuv; + switch (texture->format) { + case SDL_PIXELFORMAT_YV12: + case SDL_PIXELFORMAT_IYUV: + swdata->pitches[0] = texture->w; + swdata->pitches[1] = swdata->pitches[0] / 2; + swdata->pitches[2] = swdata->pitches[0] / 2; + swdata->planes[0] = swdata->pixels; + swdata->planes[1] = swdata->planes[0] + swdata->pitches[0] * texture->h; + swdata->planes[2] = swdata->planes[1] + swdata->pitches[1] * texture->h / 2; + break; + case SDL_PIXELFORMAT_YUY2: + case SDL_PIXELFORMAT_UYVY: + case SDL_PIXELFORMAT_YVYU: + swdata->pitches[0] = texture->w * 2; + swdata->planes[0] = swdata->pixels; + break; + default: + /* We should never get here (caught above) */ + break; + } + } else { + data->pixels = NULL; + data->pixels = SDL_malloc(texture->h * data->pitch); + if (!data->pixels) { + PS3_DestroyTexture(renderer, texture); + SDL_OutOfMemory(); + return -1; + } + } + texture->driverdata = data; + deprintf(1, "-PS3_CreateTexture()\n"); + return 0; +} + +static int +PS3_QueryTexturePixels(SDL_Renderer * renderer, SDL_Texture * texture, + void **pixels, int *pitch) +{ + deprintf(1, "+PS3_QueryTexturePixels()\n"); + PS3_TextureData *data = (PS3_TextureData *) texture->driverdata; + + if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) { + return SDL_SW_QueryYUVTexturePixels(data->yuv, pixels, pitch); + } else { + *pixels = (void *)data->pixels; + *pitch = data->pitch; + } + + deprintf(1, "-PS3_QueryTexturePixels()\n"); + return 0; +} + +static int +PS3_UpdateTexture(SDL_Renderer * renderer, SDL_Texture * texture, + const SDL_Rect * rect, const void *pixels, int pitch) +{ + deprintf(1, "+PS3_UpdateTexture()\n"); + PS3_TextureData *data = (PS3_TextureData *) texture->driverdata; + + if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) { + return SDL_SW_UpdateYUVTexture(data->yuv, rect, pixels, pitch); + } else { + Uint8 *src, *dst; + int row; + size_t length; + Uint8 *dstpixels; + + src = (Uint8 *) pixels; + dst = (Uint8 *) dstpixels + rect->y * data->pitch + rect->x + * SDL_BYTESPERPIXEL(texture->format); + length = rect->w * SDL_BYTESPERPIXEL(texture->format); + /* Update the texture */ + for (row = 0; row < rect->h; ++row) { + SDL_memcpy(dst, src, length); + src += pitch; + dst += data->pitch; + } + } + deprintf(1, "-PS3_UpdateTexture()\n"); + return 0; +} + +static int +PS3_LockTexture(SDL_Renderer * renderer, SDL_Texture * texture, + const SDL_Rect * rect, int markDirty, void **pixels, + int *pitch) +{ + deprintf(1, "+PS3_LockTexture()\n"); + PS3_TextureData *data = (PS3_TextureData *) texture->driverdata; + + if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) { + deprintf(1, "-PS3_LockTexture()\n"); + return SDL_SW_LockYUVTexture(data->yuv, rect, markDirty, pixels, pitch); + } else { + *pixels = + (void *) ((Uint8 *) data->pixels + rect->y * data->pitch + + rect->x * SDL_BYTESPERPIXEL(texture->format)); + *pitch = data->pitch; + deprintf(1, "-PS3_LockTexture()\n"); + return 0; + } +} + +static void +PS3_UnlockTexture(SDL_Renderer * renderer, SDL_Texture * texture) +{ + deprintf(1, "+PS3_UnlockTexture()\n"); + PS3_TextureData *data = (PS3_TextureData *) texture->driverdata; + + if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) { + SDL_SW_UnlockYUVTexture(data->yuv); + } + deprintf(1, "-PS3_UnlockTexture()\n"); +} + +static void +PS3_DestroyTexture(SDL_Renderer * renderer, SDL_Texture * texture) +{ + deprintf(1, "+PS3_DestroyTexture()\n"); + PS3_TextureData *data = (PS3_TextureData *) texture->driverdata; + + if (!data) { + return; + } + if (data->yuv) { + SDL_SW_DestroyYUVTexture(data->yuv); + } + if (data->pixels) { + SDL_free((void *)data->pixels); + } + deprintf(1, "-PS3_DestroyTexture()\n"); +} + +static int +SDL_PS3_RenderPoint(SDL_Renderer * renderer, int x, int y) +{ + SDL_PS3_RenderData *data = + (SDL_PS3_RenderData *) renderer->driverdata; + SDL_Surface *target = data->screen; + int status; + + if (renderer->blendMode == SDL_BLENDMODE_NONE || + renderer->blendMode == SDL_BLENDMODE_MASK) { + Uint32 color = + SDL_MapRGBA(target->format, renderer->r, renderer->g, renderer->b, + renderer->a); + + status = SDL_DrawPoint(target, x, y, color); + } else { + status = + SDL_BlendPoint(target, x, y, renderer->blendMode, renderer->r, + renderer->g, renderer->b, renderer->a); + } + return status; +} + +static int +SDL_PS3_RenderLine(SDL_Renderer * renderer, int x1, int y1, int x2, int y2) +{ + SDL_PS3_RenderData *data = + (SDL_PS3_RenderData *) renderer->driverdata; + SDL_Surface *target = data->screen; + int status; + + if (renderer->blendMode == SDL_BLENDMODE_NONE || + renderer->blendMode == SDL_BLENDMODE_MASK) { + Uint32 color = + SDL_MapRGBA(target->format, renderer->r, renderer->g, renderer->b, + renderer->a); + + status = SDL_DrawLine(target, x1, y1, x2, y2, color); + } else { + status = + SDL_BlendLine(target, x1, y1, x2, y2, renderer->blendMode, + renderer->r, renderer->g, renderer->b, renderer->a); + } + return status; +} + +static int +SDL_PS3_RenderFill(SDL_Renderer * renderer, const SDL_Rect * rect) +{ + deprintf(1, "SDL_PS3_RenderFill()\n"); + SDL_PS3_RenderData *data = + (SDL_PS3_RenderData *) renderer->driverdata; + SDL_Surface *target = data->screen; + SDL_Rect real_rect = *rect; + int status; + + if (renderer->blendMode == SDL_BLENDMODE_NONE) { + Uint32 color = + SDL_MapRGBA(target->format, renderer->r, renderer->g, renderer->b, + renderer->a); + + status = SDL_FillRect(target, &real_rect, color); + } else { + status = + SDL_BlendRect(target, &real_rect, renderer->blendMode, + renderer->r, renderer->g, renderer->b, renderer->a); + } + return status; +} + +static int +SDL_PS3_RenderCopy(SDL_Renderer * renderer, SDL_Texture * texture, + const SDL_Rect * srcrect, const SDL_Rect * dstrect) +{ + deprintf(1, "+SDL_PS3_RenderCopy()\n"); + SDL_PS3_RenderData *data = + (SDL_PS3_RenderData *) renderer->driverdata; + SDL_Window *window = SDL_GetWindowFromID(renderer->window); + SDL_VideoDisplay *display = SDL_GetDisplayFromWindow(window); + PS3_TextureData *txdata = (PS3_TextureData *) texture->driverdata; + SDL_VideoData *devdata = display->device->driverdata; + + if (SDL_ISPIXELFORMAT_FOURCC(texture->format)) { + deprintf(1, "Texture is in a FOURCC format\n"); + if ((texture->format == SDL_PIXELFORMAT_YV12 || texture->format == SDL_PIXELFORMAT_IYUV) + && texture->w % 8 == 0 && texture->h % 8 == 0 + && dstrect->w % 8 == 0 && dstrect->h % 8 == 0) { + deprintf(1, "Use SPE for scaling/converting\n"); + + SDL_SW_YUVTexture *swdata = (SDL_SW_YUVTexture *) txdata->yuv; + Uint8 *lum, *Cr, *Cb; + Uint8 *scaler_out = NULL; + Uint8 *dstpixels; + switch (texture->format) { + case SDL_PIXELFORMAT_YV12: + lum = swdata->planes[0]; + Cr = swdata->planes[1]; + Cb = swdata->planes[2]; + break; + case SDL_PIXELFORMAT_IYUV: + lum = swdata->planes[0]; + Cr = swdata->planes[2]; + Cb = swdata->planes[1]; + break; + default: + /* We should never get here (caught above) */ + return -1; + } + + if (srcrect->w != dstrect->w || srcrect->h != dstrect->h) { + deprintf(1, "We need to scale the texture from %u x %u to %u x %u\n", + srcrect->w, srcrect->h, dstrect->w, dstrect->h); + /* Alloc mem for scaled YUV picture */ + scaler_out = (Uint8 *) memalign(16, dstrect->w * dstrect->h + ((dstrect->w * dstrect->h) >> 1)); + if (scaler_out == NULL) { + SDL_OutOfMemory(); + return -1; + } + + /* Set parms for scaling */ + data->scaler_parms->src_pixel_width = srcrect->w; + data->scaler_parms->src_pixel_height = srcrect->h; + data->scaler_parms->dst_pixel_width = dstrect->w; + data->scaler_parms->dst_pixel_height = dstrect->h; + data->scaler_parms->y_plane = lum; + data->scaler_parms->v_plane = Cr; + data->scaler_parms->u_plane = Cb; + data->scaler_parms->dstBuffer = scaler_out; + data->scaler_thread_data->argp = (void *)data->scaler_parms; + + /* Scale the YUV overlay to given size */ + SPE_Start(data->scaler_thread_data); + SPE_Stop(data->scaler_thread_data); + + /* Set parms for converting after scaling */ + data->converter_parms->y_plane = scaler_out; + data->converter_parms->v_plane = scaler_out + dstrect->w * dstrect->h; + data->converter_parms->u_plane = scaler_out + dstrect->w * dstrect->h + ((dstrect->w * dstrect->h) >> 2); + } else { + data->converter_parms->y_plane = lum; + data->converter_parms->v_plane = Cr; + data->converter_parms->u_plane = Cb; + } + + dstpixels = (Uint8 *) data->screen->pixels + dstrect->y * data->screen->pitch + dstrect->x + * SDL_BYTESPERPIXEL(texture->format); + data->converter_parms->src_pixel_width = dstrect->w; + data->converter_parms->src_pixel_height = dstrect->h; + data->converter_parms->dstBuffer = dstpixels/*(Uint8 *)data->screen->pixels*/; + data->converter_thread_data->argp = (void *)data->converter_parms; + + /* Convert YUV texture to RGB */ + SPE_SendMsg(data->converter_thread_data, SPU_START); + SPE_SendMsg(data->converter_thread_data, (unsigned int)data->converter_thread_data->argp); + + /* We can probably move that to RenderPresent() */ + SPE_WaitForMsg(data->converter_thread_data, SPU_FIN); + if (scaler_out) { + free(scaler_out); + } + } else { + deprintf(1, "Use software for scaling/converting\n"); + Uint8 *dst; + /* FIXME: Not good */ + dst = (Uint8 *) data->screen->pixels + dstrect->y * data->screen->pitch + dstrect->x + * SDL_BYTESPERPIXEL(texture->format); + return SDL_SW_CopyYUVToRGB(txdata->yuv, srcrect, display->current_mode.format, + dstrect->w, dstrect->h, dst/*data->screen->pixels*/, + data->screen->pitch); + } + } else { + deprintf(1, "SDL_ISPIXELFORMAT_FOURCC = false\n"); + + Uint8 *src, *dst; + int row; + size_t length; + Uint8 *dstpixels; + + src = (Uint8 *) txdata->pixels; + dst = (Uint8 *) data->screen->pixels + dstrect->y * data->screen->pitch + dstrect->x + * SDL_BYTESPERPIXEL(texture->format); + length = dstrect->w * SDL_BYTESPERPIXEL(texture->format); + for (row = 0; row < dstrect->h; ++row) { + SDL_memcpy(dst, src, length); + src += txdata->pitch; + dst += data->screen->pitch; + } + } + + deprintf(1, "-SDL_PS3_RenderCopy()\n"); + return 0; +} + +static void +SDL_PS3_RenderPresent(SDL_Renderer * renderer) +{ + deprintf(1, "+SDL_PS3_RenderPresent()\n"); + SDL_PS3_RenderData *data = + (SDL_PS3_RenderData *) renderer->driverdata; + SDL_Window *window = SDL_GetWindowFromID(renderer->window); + SDL_VideoDisplay *display = SDL_GetDisplayFromWindow(window); + SDL_VideoData *devdata = display->device->driverdata; + + /* Send the data to the screen */ + /* Get screeninfo */ + struct fb_fix_screeninfo fb_finfo; + if (ioctl(devdata->fbdev, FBIOGET_FSCREENINFO, &fb_finfo)) { + SDL_SetError("[PS3] Can't get fixed screeninfo"); + } + struct fb_var_screeninfo fb_vinfo; + if (ioctl(devdata->fbdev, FBIOGET_VSCREENINFO, &fb_vinfo)) { + SDL_SetError("[PS3] Can't get VSCREENINFO"); + } + + /* 16 and 15 bpp is reported as 16 bpp */ + //txdata->bpp = fb_vinfo.bits_per_pixel; + //if (txdata->bpp == 16) + // txdata->bpp = fb_vinfo.red.length + fb_vinfo.green.length + fb_vinfo.blue.length; + + /* Adjust centering */ + data->bounded_width = window->w < fb_vinfo.xres ? window->w : fb_vinfo.xres; + data->bounded_height = window->h < fb_vinfo.yres ? window->h : fb_vinfo.yres; + /* We could use SDL's CENTERED flag for centering */ + data->offset_left = (fb_vinfo.xres - data->bounded_width) >> 1; + data->offset_top = (fb_vinfo.yres - data->bounded_height) >> 1; + data->center[0] = devdata->frame_buffer + data->offset_left * /*txdata->bpp/8*/ 4 + + data->offset_top * fb_finfo.line_length; + data->center[1] = data->center[0] + fb_vinfo.yres * fb_finfo.line_length; + + deprintf(1, "offset_left = %u\n", data->offset_left); + deprintf(1, "offset_top = %u\n", data->offset_top); + + /* Set SPU parms for copying the surface to framebuffer */ + devdata->fb_parms->data = (unsigned char *)data->screen->pixels; + devdata->fb_parms->center = data->center[data->current_screen]; + devdata->fb_parms->out_line_stride = fb_finfo.line_length; + devdata->fb_parms->in_line_stride = window->w * /*txdata->bpp / 8*/4; + devdata->fb_parms->bounded_input_height = data->bounded_height; + devdata->fb_parms->bounded_input_width = data->bounded_width; + //devdata->fb_parms->fb_pixel_size = txdata->bpp / 8; + devdata->fb_parms->fb_pixel_size = 4;//SDL_BYTESPERPIXEL(window->format); + + deprintf(3, "[PS3->SPU] fb_thread_data->argp = 0x%x\n", devdata->fb_thread_data->argp); + + /* Copying.. */ + SPE_SendMsg(devdata->fb_thread_data, SPU_START); + SPE_SendMsg(devdata->fb_thread_data, (unsigned int)devdata->fb_thread_data->argp); + + SPE_WaitForMsg(devdata->fb_thread_data, SPU_FIN); + + /* Wait for vsync */ + if (renderer->info.flags & SDL_RENDERER_PRESENTVSYNC) { + unsigned long crt = 0; + deprintf(1, "[PS3] Wait for vsync\n"); + ioctl(devdata->fbdev, FBIO_WAITFORVSYNC, &crt); + } + + /* Page flip */ + deprintf(1, "[PS3] Page flip to buffer #%u 0x%x\n", data->current_screen, data->center[data->current_screen]); + ioctl(devdata->fbdev, PS3FB_IOCTL_FSEL, (unsigned long)&data->current_screen); + + /* Update the flipping chain, if any */ + if (data->double_buffering) { + data->current_screen = (data->current_screen + 1) % 2; + } + deprintf(1, "-SDL_PS3_RenderPresent()\n"); +} + +static void +SDL_PS3_DestroyRenderer(SDL_Renderer * renderer) +{ + deprintf(1, "+SDL_PS3_DestroyRenderer()\n"); + SDL_PS3_RenderData *data = + (SDL_PS3_RenderData *) renderer->driverdata; + int i; + + if (data) { + for (i = 0; i < SDL_arraysize(data->screen); ++i) { + if (data->screen) { + SDL_FreeSurface(data->screen); + } + } + + /* Shutdown SPE and release related resources */ + if (data->scaler_thread_data) { + free((void *)data->scaler_thread_data); + } + if (data->scaler_parms) { + free((void *)data->scaler_parms); + } + if (data->converter_thread_data) { + SPE_Shutdown(data->converter_thread_data); + free((void *)data->converter_thread_data); + } + if (data->converter_parms) { + free((void *)data->converter_parms); + } + + SDL_free(data); + } + SDL_free(renderer); + deprintf(1, "-SDL_PS3_DestroyRenderer()\n"); +} + +/* vi: set ts=4 sw=4 expandtab: */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/SDL_ps3render_c.h Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,29 @@ +/* + SDL - Simple DirectMedia Layer + Copyright (C) 1997-2009 Sam Lantinga + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + Sam Lantinga + slouken@libsdl.org +*/ +#include "SDL_config.h" + +/* Default framebuffer device on PS3 */ +/* SDL surface based renderer implementation */ + +extern SDL_RenderDriver SDL_PS3_RenderDriver; + +/* vi: set ts=4 sw=4 expandtab: */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/SDL_ps3spe.c Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,166 @@ +/* + SDL - Simple DirectMedia Layer + Copyright (C) 1997-2009 Sam Lantinga + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + Sam Lantinga + slouken@libsdl.org +*/ +#include "SDL_config.h" + +#include "SDL_video.h" +#include "SDL_ps3spe_c.h" + +#include "SDL_ps3video.h" +#include "SDL_ps3render_c.h" + +/* Start the SPE thread */ +int SPE_Start(spu_data_t * spe_data) +{ + deprintf(2, "[PS3->SPU] Start SPE: %s\n", spe_data->program_name); + if (!(spe_data->booted)) + SPE_Boot(spe_data); + + /* To allow re-running of context, spe_ctx_entry has to be set before each call */ + spe_data->entry = SPE_DEFAULT_ENTRY; + spe_data->error_code = 0; + + /* Create SPE thread and run */ + deprintf(2, "[PS3->SPU] Create Thread: %s\n", spe_data->program_name); + if (pthread_create + (&spe_data->thread, NULL, (void *)&SPE_RunContext, (void *)spe_data)) { + deprintf(2, "[PS3->SPU] Could not create pthread for spe: %s\n", spe_data->program_name); + SDL_SetError("[PS3->SPU] Could not create pthread for spe"); + return -1; + } + + if (spe_data->keepalive) + SPE_WaitForMsg(spe_data, SPU_READY); +} + +/* Stop the SPE thread */ +int SPE_Stop(spu_data_t * spe_data) +{ + deprintf(2, "[PS3->SPU] Stop SPE: %s\n", spe_data->program_name); + /* Wait for SPE thread to complete */ + deprintf(2, "[PS3->SPU] Wait for SPE thread to complete: %s\n", spe_data->program_name); + if (pthread_join(spe_data->thread, NULL)) { + deprintf(2, "[PS3->SPU] Failed joining the thread: %s\n", spe_data->program_name); + SDL_SetError("[PS3->SPU] Failed joining the thread"); + return -1; + } + + return 0; +} + +/* Create SPE context and load program */ +int SPE_Boot(spu_data_t * spe_data) +{ + /* Create SPE context */ + deprintf(2, "[PS3->SPU] Create SPE Context: %s\n", spe_data->program_name); + spe_data->ctx = spe_context_create(0, NULL); + if (spe_data->ctx == NULL) { + deprintf(2, "[PS3->SPU] Failed creating SPE context: %s\n", spe_data->program_name); + SDL_SetError("[PS3->SPU] Failed creating SPE context"); + return -1; + } + + /* Load SPE object into SPE local store */ + deprintf(2, "[PS3->SPU] Load Program into SPE: %s\n", spe_data->program_name); + if (spe_program_load(spe_data->ctx, &spe_data->program)) { + deprintf(2, "[PS3->SPU] Failed loading program into SPE context: %s\n", spe_data->program_name); + SDL_SetError + ("[PS3->SPU] Failed loading program into SPE context"); + return -1; + } + spe_data->booted = 1; + deprintf(2, "[PS3->SPU] SPE boot successful\n"); + + return 0; +} + +/* (Stop and) shutdown the SPE */ +int SPE_Shutdown(spu_data_t * spe_data) +{ + if (spe_data->keepalive && spe_data->booted) { + SPE_SendMsg(spe_data, SPU_EXIT); + SPE_Stop(spe_data); + } + + /* Destroy SPE context */ + deprintf(2, "[PS3->SPU] Destroy SPE context: %s\n", spe_data->program_name); + if (spe_context_destroy(spe_data->ctx)) { + deprintf(2, "[PS3->SPU] Failed destroying context: %s\n", spe_data->program_name); + SDL_SetError("[PS3->SPU] Failed destroying context"); + return -1; + } + deprintf(2, "[PS3->SPU] SPE shutdown successful: %s\n", spe_data->program_name); + return 0; +} + +/* Send message to the SPE via mailboxe */ +int SPE_SendMsg(spu_data_t * spe_data, unsigned int msg) +{ + deprintf(2, "[PS3->SPU] Sending message %u to %s\n", msg, spe_data->program_name); + /* Send one message, block until message was sent */ + unsigned int spe_in_mbox_msgs[1]; + spe_in_mbox_msgs[0] = msg; + int in_mbox_write = spe_in_mbox_write(spe_data->ctx, spe_in_mbox_msgs, 1, SPE_MBOX_ALL_BLOCKING); + + if (1 > in_mbox_write) { + deprintf(2, "[PS3->SPU] No message could be written to %s\n", spe_data->program_name); + SDL_SetError("[PS3->SPU] No message could be written"); + return -1; + } + return 0; +} + + +/* Read 1 message from SPE, block until at least 1 message was received */ +int SPE_WaitForMsg(spu_data_t * spe_data, unsigned int msg) +{ + deprintf(2, "[PS3->SPU] Waiting for message from %s\n", spe_data->program_name); + unsigned int out_messages[1]; + while (!spe_out_mbox_status(spe_data->ctx)); + int mbox_read = spe_out_mbox_read(spe_data->ctx, out_messages, 1); + deprintf(2, "[PS3->SPU] Got message from %s, message was %u\n", spe_data->program_name, out_messages[0]); + if (out_messages[0] == msg) + return 0; + else + return -1; +} + +/* Re-runnable invocation of the spe_context_run call */ +void SPE_RunContext(void *thread_argp) +{ + /* argp is the pointer to argument to be passed to the SPE program */ + spu_data_t *args = (spu_data_t *) thread_argp; + deprintf(3, "[PS3->SPU] void* argp=0x%x\n", (unsigned int)args->argp); + + /* Run it.. */ + deprintf(2, "[PS3->SPU] Run SPE program: %s\n", args->program_name); + if (spe_context_run + (args->ctx, &args->entry, 0, (void *)args->argp, NULL, + NULL) < 0) { + deprintf(2, "[PS3->SPU] Failed running SPE context: %s\n", args->program_name); + SDL_SetError("[PS3->SPU] Failed running SPE context: %s", args->program_name); + exit(1); + } + + pthread_exit(NULL); +} + +/* vi: set ts=4 sw=4 expandtab: */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/SDL_ps3spe_c.h Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,87 @@ +/* + SDL - Simple DirectMedia Layer + Copyright (C) 1997-2009 Sam Lantinga + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + Sam Lantinga + slouken@libsdl.org +*/ + +/* This SPE API basically provides 3 ways to run and control a program + * on the SPE: + * - Start and stop the program (keepalive=0). + * SPE_Start() will implicitly boot up the program, create a thread and run + * the context. + * SPE_Stop() will join the (terminated) thread (may block) and return. + * - Boot the program and run it (keepalive=0). + * SPE_Boot() will create a context and load the program and finally start + * the context with SPE_Start(). + * SPE_Stop() will savely end the program. + * - Boot, Run and send messages to the program (keepalive=1). + * Start the program by using one of the methods described above. When + * received the READY-message the program is in its infinite loop waiting + * for new messages. + * Every time you run the program, send SPU_START and the address of the + * according struct using SPE_SendMsg(). + * SPE_WaitForMsg() will than wait for SPU_FIN and is blocking. + * SPE_Shutdown() sends SPU_EXIT and finally stops the program. + * + * Therefor the SPE program + * - either runs once and returns + * - or runs in an infinite loop and is controlled by messages. + */ + +#include "SDL_config.h" + +#include "spulibs/spu_common.h" + +#include <libspe2.h> + +#ifndef _SDL_ps3spe_h +#define _SDL_ps3spe_h + +/* SPU handling data */ +typedef struct spu_data { + /* Context to be executed */ + spe_context_ptr_t ctx; + spe_program_handle_t program; + /* Thread running the context */ + pthread_t thread; + /* For debugging */ + char * program_name; + /* SPE_Start() or SPE_Boot() called */ + unsigned int booted; + /* Runs the program in an infinite loop? */ + unsigned int keepalive; + unsigned int entry; + /* Exit code of the program */ + int error_code; + /* Arguments passed to the program */ + void * argp; +} spu_data_t; + +/* SPU specific API functions */ +int SPE_Start(spu_data_t * spe_data); +int SPE_Stop(spu_data_t * spe_data); +int SPE_Boot(spu_data_t * spe_data); +int SPE_Shutdown(spu_data_t * spe_data); +int SPE_SendMsg(spu_data_t * spe_data, unsigned int msg); +int SPE_WaitForMsg(spu_data_t * spe_data, unsigned int msg); +void SPE_RunContext(void *thread_argp); + +#endif /* _SDL_ps3spe_h */ + +/* vi: set ts=4 sw=4 expandtab: */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/SDL_ps3video.c Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,224 @@ +/* + SDL - Simple DirectMedia Layer + Copyright (C) 1997-2009 Sam Lantinga + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + Sam Lantinga + slouken@libsdl.org +*/ +#include "SDL_config.h" + +/* SDL PS3 video driver implementation based on dummy video driver + * + * Initial work by Ryan C. Gordon (icculus@icculus.org). A good portion + * of this was cut-and-pasted from Stephane Peter's work in the AAlib + * SDL video driver. Renamed to "DUMMY" by Sam Lantinga. + */ + +#include "SDL_video.h" +#include "SDL_mouse.h" +#include "../SDL_sysvideo.h" +#include "../SDL_pixels_c.h" +#include "../../events/SDL_events_c.h" + +#include "SDL_ps3video.h" +#include "SDL_ps3spe_c.h" +#include "SDL_ps3events_c.h" +#include "SDL_ps3render_c.h" +#include "SDL_ps3modes_c.h" + +#include <fcntl.h> +#include <linux/fb.h> +#include <asm/ps3fb.h> +#include <sys/mman.h> + +#define PS3VID_DRIVER_NAME "ps3" + +/* Initialization/Query functions */ +static int PS3_VideoInit(_THIS); +static void PS3_VideoQuit(_THIS); + +/* Stores the SPE executable name of fb_writer_spu */ +extern spe_program_handle_t fb_writer_spu; + +/* PS3 driver bootstrap functions */ + +static int +PS3_Available(void) +{ + deprintf(1, "+PS3_Available()\n"); + const char *envr = SDL_getenv("SDL_VIDEODRIVER"); + if ((envr) && (SDL_strcmp(envr, PS3VID_DRIVER_NAME) == 0)) { + return (1); + } + + deprintf(1, "-PS3_Available()\n"); + return (0); +} + +static void +PS3_DeleteDevice(SDL_VideoDevice * device) +{ + deprintf(1, "+PS3_DeleteDevice()\n"); + SDL_free(device->driverdata); + SDL_free(device); + deprintf(1, "-PS3_DeleteDevice()\n"); +} + +static SDL_VideoDevice * +PS3_CreateDevice(int devindex) +{ + deprintf(1, "+PS3_CreateDevice()\n"); + SDL_VideoDevice *device; + SDL_VideoData *data; + + /* Initialize all variables that we clean on shutdown */ + device = (SDL_VideoDevice *) SDL_calloc(1, sizeof(SDL_VideoDevice)); + if (!device) { + SDL_OutOfMemory(); + if (device) { + SDL_free(device); + } + return (0); + } + data = (struct SDL_VideoData *) SDL_calloc(1, sizeof(SDL_VideoData)); + if (!data) { + SDL_OutOfMemory(); + SDL_free(device); + return (0); + } + device->driverdata = data; + + /* Set the function pointers */ + device->VideoInit = PS3_VideoInit; + device->VideoQuit = PS3_VideoQuit; + device->SetDisplayMode = PS3_SetDisplayMode; + device->GetDisplayModes = PS3_GetDisplayModes; + device->PumpEvents = PS3_PumpEvents; + + device->free = PS3_DeleteDevice; + + deprintf(1, "-PS3_CreateDevice()\n"); + return device; +} + +VideoBootStrap PS3_bootstrap = { + PS3VID_DRIVER_NAME, "SDL PS3 Cell video driver", + PS3_Available, PS3_CreateDevice +}; + + +int +PS3_VideoInit(_THIS) +{ + deprintf(1, "PS3_VideoInit()\n"); + + SDL_VideoData *data = (SDL_VideoData *) _this->driverdata; + SDL_DisplayMode mode; + + /* Create SPU fb_parms and thread structure */ + data->fb_parms = (struct fb_writer_parms_t *) + memalign(16, sizeof(struct fb_writer_parms_t)); + data->fb_thread_data = (spu_data_t *) malloc(sizeof(spu_data_t)); + if (data->fb_parms == NULL || data->fb_thread_data == NULL) { + SDL_OutOfMemory(); + return -1; + } + data->fb_thread_data->program = fb_writer_spu; + data->fb_thread_data->program_name = "fb_writer_spu"; + data->fb_thread_data->argp = (void *)data->fb_parms; + data->fb_thread_data->keepalive = 1; + data->fb_thread_data->booted = 0; + + SPE_Start(data->fb_thread_data); + + /* Open the device */ + data->fbdev = open(PS3DEV, O_RDWR); + if (data->fbdev < 0) { + SDL_SetError("[PS3] Unable to open device %s", PS3DEV); + return -1; + } + + /* Take control of frame buffer from kernel, for details see + * http://felter.org/wesley/files/ps3/linux-20061110-docs/ApplicationProgrammingEnvironment.html + * kernel will no longer flip the screen itself + */ + ioctl(data->fbdev, PS3FB_IOCTL_ON, 0); + + /* Unblank screen */ + ioctl(data->fbdev, FBIOBLANK, 0); + + struct fb_fix_screeninfo fb_finfo; + if (ioctl(data->fbdev, FBIOGET_FSCREENINFO, &fb_finfo)) { + SDL_SetError("[PS3] Can't get fixed screeninfo"); + return (0); + } + + /* Note: on PS3, fb_finfo.smem_len is enough for double buffering */ + if ((data->frame_buffer = (uint8_t *)mmap(0, fb_finfo.smem_len, + PROT_READ | PROT_WRITE, MAP_SHARED, + data->fbdev, 0)) == (uint8_t *) - 1) { + SDL_SetError("[PS3] Can't mmap for %s", PS3DEV); + return (0); + } else { + /* Enable double buffering */ + } + + /* Blank screen */ + memset(data->frame_buffer, 0x00, fb_finfo.smem_len); + + PS3_InitModes(_this); + SDL_AddRenderDriver(0, &SDL_PS3_RenderDriver); + + /* We're done! */ + return 0; +} + +void +PS3_VideoQuit(_THIS) +{ + deprintf(1, "PS3_VideoQuit()\n"); + SDL_VideoData *data = (SDL_VideoData *) _this->driverdata; + + PS3_QuitModes(_this); + + /* Unmap framebuffer */ + if (data->frame_buffer) { + struct fb_fix_screeninfo fb_finfo; + if (ioctl(data->fbdev, FBIOGET_FSCREENINFO, &fb_finfo) != -1) { + munmap(data->frame_buffer, fb_finfo.smem_len); + data->frame_buffer = 0; + } + } + + /* Shutdown SPE and related resources */ + if (data->fb_parms) + free((void *)data->fb_parms); + if (data->fb_thread_data) { + SPE_Shutdown(data->fb_thread_data); + free((void *)data->fb_thread_data); + } + + /* Close device */ + if (data->fbdev) { + /* Give control of frame buffer back to kernel */ + ioctl(data->fbdev, PS3FB_IOCTL_OFF, 0); + close(data->fbdev); + data->fbdev = -1; + } +} + +/* vi: set ts=4 sw=4 expandtab: */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/SDL_ps3video.h Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,79 @@ +/* + SDL - Simple DirectMedia Layer + Copyright (C) 1997-2009 Sam Lantinga + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + Sam Lantinga + slouken@libsdl.org +*/ +#include "SDL_config.h" + +#ifndef _SDL_ps3video_h +#define _SDL_ps3video_h + +#include "../SDL_sysvideo.h" +#include "SDL_ps3spe_c.h" + +#include <linux/fb.h> +#include <asm/ps3fb.h> + +/* Debugging + * 0: No debug messages + * 1: Video debug messages + * 2: SPE debug messages + * 3: Memory adresses + */ +#define DEBUG_LEVEL 0 + +#ifdef DEBUG_LEVEL +#define deprintf( level, fmt, args... ) \ + do \ +{ \ + if ( (unsigned)(level) <= DEBUG_LEVEL ) \ + { \ + fprintf( stdout, fmt, ##args ); \ + fflush( stdout ); \ + } \ +} while ( 0 ) +#else +#define deprintf( level, fmt, args... ) +#endif + +/* Default framebuffer device on PS3 */ +#define PS3DEV "/dev/fb0" + +/* Private display data */ +typedef struct SDL_VideoData +{ + /* Framebuffer device descriptor */ + int fbdev; + /* mmap'd access to fbdev */ + uint8_t * frame_buffer; + /* SPE threading stuff of the framebuffer */ + spu_data_t * fb_thread_data; + /* Framebuffer transfer data */ + volatile struct fb_writer_parms_t * fb_parms __attribute__((aligned(128))); +} SDL_VideoData; + +typedef struct SDL_DisplayModeData +{ + unsigned long mode; + //struct ps3fb_ioctl_res res; +} PS3_DisplayModeData; + +#endif /* _SDL_ps3video_h */ + +/* vi: set ts=4 sw=4 expandtab: */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/spulibs/Makefile Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,47 @@ +# This Makefile is for building the CELL BE SPU libs +# libfb_writer_spu.so, libyuv2rgb_spu.so, libbilin_scaler_spu.so + +# Toolchain +PPU_LD=/usr/bin/ld +SPU_SRCDIR=$(srcdir)/src/video/ps3/spulibs +SPU_LIBDIR=$(srcdir)/src/video/ps3/spulibs/libs +SPU_CFLAGS=-g -W -Wall -Winline -Wno-main -I. -I /usr/spu/include -I /opt/cell/sdk/usr/spu/include -finline-limit=10000 -Winline -ftree-vectorize -funroll-loops -fmodulo-sched -ffast-math -fPIC -O2 + +DEPS = $(SPU_SRCDIR)/spu_common.h +LIBS= fb_writer yuv2rgb bilin_scaler + +OBJLIBS = $(foreach lib,$(LIBS),lib$(lib)_spu.a) +SHALIBS = $(foreach lib,$(LIBS),lib$(lib)_spu.so) + + +ps3libs: $(foreach lib,$(OBJLIBS),$(SPU_LIBDIR)/$(lib)) $(foreach lib,$(SHALIBS),$(SPU_LIBDIR)/$(lib)) + + +$(SPU_LIBDIR)/lib%_spu.a: $(SPU_LIBDIR)/%-embed.o + $(AR) -qcs $@ $< + +$(SPU_LIBDIR)/lib%_spu.so: $(SPU_LIBDIR)/%-embed.o + $(PPU_LD) -o $@ -shared -soname=$(notdir $@) $< + +$(SPU_LIBDIR)/%-embed.o: $(SPU_LIBDIR)/%.o + $(EMBEDSPU) -m32 $(subst -embed.o,,$(notdir $@))_spu $< $@ + +$(SPU_LIBDIR)/%.o: $(SPU_SRCDIR)/%.c $(DEPS) + $(SPU_GCC) $(SPU_CFLAGS) -o $@ $< -lm + + +ps3libs-install: $(foreach obj,$(OBJLIBS),$(SPU_LIBDIR)/$(obj)) $(foreach obj,$(SHALIBS),$(SPU_LIBDIR)/$(obj)) + for file in $(OBJLIBS); do \ + $(INSTALL) -c -m 0655 $(SPU_LIBDIR)/$$file $(DESTDIR)$(libdir)/$$file; \ + done + for file in $(SHALIBS); do \ + $(INSTALL) -c -m 0755 $(SPU_LIBDIR)/$$file $(DESTDIR)$(libdir)/$$file; \ + done + +ps3libs-uninstall: + for file in $(OBJLIBS) $(SHALIBS); do \ + rm -f $(DESTDIR)$(libdir)/$$file; \ + done + +ps3libs-clean: + rm -f $(SPU_LIBDIR)/*
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/spulibs/bilin_scaler.c Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,2050 @@ +/* + * SDL - Simple DirectMedia Layer + * CELL BE Support for PS3 Framebuffer + * Copyright (C) 2008, 2009 International Business Machines Corporation + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + * + * Martin Lowinski <lowinski [at] de [dot] ibm [ibm] com> + * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com> + * SPE code based on research by: + * Rene Becker + * Thimo Emmerich + */ + +#include "spu_common.h" + +#include <spu_intrinsics.h> +#include <spu_mfcio.h> + +// Debugging +//#define DEBUG + +#ifdef DEBUG +#define deprintf(fmt, args... ) \ + fprintf( stdout, fmt, ##args ); \ + fflush( stdout ); +#else +#define deprintf( fmt, args... ) +#endif + +struct scale_parms_t parms __attribute__((aligned(128))); + +/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored + * there might be the need to retrieve misaligned data, adjust + * incoming v and u plane to be able to handle this (add 128) + */ +unsigned char y_plane[2][(MAX_HDTV_WIDTH+128)*4] __attribute__((aligned(128))); +unsigned char v_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128))); +unsigned char u_plane[2][(MAX_HDTV_WIDTH+128)*2] __attribute__((aligned(128))); + +/* temp-buffer for scaling: 4 lines Y, therefore 2 lines V, 2 lines U */ +unsigned char scaled_y_plane[2][MAX_HDTV_WIDTH*2] __attribute__((aligned(128))); +unsigned char scaled_v_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128))); +unsigned char scaled_u_plane[2][MAX_HDTV_WIDTH/2] __attribute__((aligned(128))); + +/* some vectors needed by the float to int conversion */ +static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f }; +static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f }; + +void bilinear_scale_line_w8(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride); +void bilinear_scale_line_w16(unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride); + +void scale_srcw16_dstw16(); +void scale_srcw16_dstw32(); +void scale_srcw32_dstw16(); +void scale_srcw32_dstw32(); + +int main( unsigned long long spe_id __attribute__((unused)), unsigned long long argp ) +{ + deprintf("[SPU] bilin_scaler_spu is up... (on SPE #%llu)\n", spe_id); + /* DMA transfer for the input parameters */ + spu_mfcdma32(&parms, (unsigned int)argp, sizeof(struct scale_parms_t), TAG_INIT, MFC_GET_CMD); + DMA_WAIT_TAG(TAG_INIT); + + deprintf("[SPU] Scale %ux%u to %ux%u\n", parms.src_pixel_width, parms.src_pixel_height, + parms.dst_pixel_width, parms.dst_pixel_height); + + if(parms.src_pixel_width & 0x1f) { + if(parms.dst_pixel_width & 0x1F) { + deprintf("[SPU] Using scale_srcw16_dstw16\n"); + scale_srcw16_dstw16(); + } else { + deprintf("[SPU] Using scale_srcw16_dstw32\n"); + scale_srcw16_dstw32(); + } + } else { + if(parms.dst_pixel_width & 0x1F) { + deprintf("[SPU] Using scale_srcw32_dstw16\n"); + scale_srcw32_dstw16(); + } else { + deprintf("[SPU] Using scale_srcw32_dstw32\n"); + scale_srcw32_dstw32(); + } + } + deprintf("[SPU] bilin_scaler_spu... done!\n"); + + return 0; +} + + +/* + * vfloat_to_vuint() + * + * converts a float vector to an unsinged int vector using saturated + * arithmetic + * + * @param vec_s float vector for conversion + * @returns converted unsigned int vector + */ +inline static vector unsigned int vfloat_to_vuint(vector float vec_s) { + vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s); + vec_s = spu_sel(vec_s, vec_0_1, select_1); + + vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255); + vec_s = spu_sel(vec_s, vec_255, select_2); + return spu_convtu(vec_s,0); +} + + +/* + * scale_srcw16_dstw16() + * + * processes an input image of width 16 + * scaling is done to a width 16 + * result stored in RAM + */ +void scale_srcw16_dstw16() { + // extract parameters + unsigned char* dst_addr = (unsigned char *)parms.dstBuffer; + + unsigned int src_width = parms.src_pixel_width; + unsigned int src_height = parms.src_pixel_height; + unsigned int dst_width = parms.dst_pixel_width; + unsigned int dst_height = parms.dst_pixel_height; + + // YVU + unsigned int src_linestride_y = src_width; + unsigned int src_dbl_linestride_y = src_width<<1; + unsigned int src_linestride_vu = src_width>>1; + unsigned int src_dbl_linestride_vu = src_width; + + // scaled YVU + unsigned int scaled_src_linestride_y = dst_width; + + // ram addresses + unsigned char* src_addr_y = parms.y_plane; + unsigned char* src_addr_v = parms.v_plane; + unsigned char* src_addr_u = parms.u_plane; + + // for handling misalignment, addresses are precalculated + unsigned char* precalc_src_addr_v = src_addr_v; + unsigned char* precalc_src_addr_u = src_addr_u; + + unsigned int dst_picture_size = dst_width*dst_height; + + // Sizes for destination + unsigned int dst_dbl_linestride_y = dst_width<<1; + unsigned int dst_dbl_linestride_vu = dst_width>>1; + + // Perform address calculation for Y, V and U in main memory with dst_addr as base + unsigned char* dst_addr_main_memory_y = dst_addr; + unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size; + unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2); + + // calculate scale factors + vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width ); + float y_scale = (float)src_height/(float)dst_height; + + // double buffered processing + // buffer switching + unsigned int curr_src_idx = 0; + unsigned int curr_dst_idx = 0; + unsigned int next_src_idx, next_dst_idx; + + // 2 lines y as output, upper and lowerline + unsigned int curr_interpl_y_upper = 0; + unsigned int next_interpl_y_upper; + unsigned int curr_interpl_y_lower, next_interpl_y_lower; + // only 1 line v/u output, both planes have the same dimension + unsigned int curr_interpl_vu = 0; + unsigned int next_interpl_vu; + + // weights, calculated in every loop iteration + vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_y_upper; + vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower; + vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_vu; + + // line indices for the src picture + float curr_src_y_upper = 0.0f, next_src_y_upper; + float curr_src_y_lower, next_src_y_lower; + float curr_src_vu = 0.0f, next_src_vu; + + // line indices for the dst picture + unsigned int dst_y=0, dst_vu=0; + + // offset for the v and u plane to handle misalignement + unsigned int curr_lsoff_v = 0, next_lsoff_v; + unsigned int curr_lsoff_u = 0, next_lsoff_u; + + // calculate lower line indices + curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale; + curr_interpl_y_lower = (unsigned int)curr_src_y_lower; + // lower line weight + vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower ); + + + // start partially double buffered processing + // get initial data, 2 sets of y, 1 set v, 1 set u + mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 ); + mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF, + 0, 0 ); + mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + + /* iteration loop + * within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved + * the scaled output is 2 lines y, 1 line v, 1 line u + * the yuv2rgb-converted output is stored to RAM + */ + for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) { + dst_y = dst_vu<<1; + + // calculate next indices + next_src_vu = ((float)dst_vu+1)*y_scale; + next_src_y_upper = ((float)dst_y+2)*y_scale; + next_src_y_lower = ((float)dst_y+3)*y_scale; + + next_interpl_vu = (unsigned int) next_src_vu; + next_interpl_y_upper = (unsigned int) next_src_y_upper; + next_interpl_y_lower = (unsigned int) next_src_y_lower; + + // calculate weight NORTH-SOUTH + vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu ); + vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper ); + vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower ); + + // get next lines + next_src_idx = curr_src_idx^1; + next_dst_idx = curr_dst_idx^1; + + // 4 lines y + mfc_get( y_plane[next_src_idx], + (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + + // 2 lines v + precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu); + next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F; + mfc_get( v_plane[next_src_idx], + ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0, + src_dbl_linestride_vu+(next_lsoff_v<<1), + RETR_BUF+next_src_idx, + 0, 0 ); + // 2 lines u + precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu); + next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F; + mfc_get( u_plane[next_src_idx], + ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0, + src_dbl_linestride_vu+(next_lsoff_v<<1), + RETR_BUF+next_src_idx, + 0, 0 ); + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v, + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u, + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + + // Store the result back to main memory into a destination buffer in YUV format + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + //--------------------------------------------------------------------------------------------- + + + // update for next cycle + curr_src_idx = next_src_idx; + curr_dst_idx = next_dst_idx; + + curr_interpl_y_upper = next_interpl_y_upper; + curr_interpl_y_lower = next_interpl_y_lower; + curr_interpl_vu = next_interpl_vu; + + vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper; + vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower; + vf_curr_NSweight_vu = vf_next_NSweight_vu; + + curr_src_y_upper = next_src_y_upper; + curr_src_y_lower = next_src_y_lower; + curr_src_vu = next_src_vu; + + curr_lsoff_v = next_lsoff_v; + curr_lsoff_u = next_lsoff_u; + } + + + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v, + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u, + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + + // Store the result back to main memory into a destination buffer in YUV format + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int)dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int)dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int)dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + // wait for completion + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + //--------------------------------------------------------------------------------------------- +} + + +/* + * scale_srcw16_dstw32() + * + * processes an input image of width 16 + * scaling is done to a width 32 + * yuv2rgb conversion on a width of 32 + * result stored in RAM + */ +void scale_srcw16_dstw32() { + // extract parameters + unsigned char* dst_addr = (unsigned char *)parms.dstBuffer; + + unsigned int src_width = parms.src_pixel_width; + unsigned int src_height = parms.src_pixel_height; + unsigned int dst_width = parms.dst_pixel_width; + unsigned int dst_height = parms.dst_pixel_height; + + // YVU + unsigned int src_linestride_y = src_width; + unsigned int src_dbl_linestride_y = src_width<<1; + unsigned int src_linestride_vu = src_width>>1; + unsigned int src_dbl_linestride_vu = src_width; + // scaled YVU + unsigned int scaled_src_linestride_y = dst_width; + + // ram addresses + unsigned char* src_addr_y = parms.y_plane; + unsigned char* src_addr_v = parms.v_plane; + unsigned char* src_addr_u = parms.u_plane; + + unsigned int dst_picture_size = dst_width*dst_height; + + // Sizes for destination + unsigned int dst_dbl_linestride_y = dst_width<<1; + unsigned int dst_dbl_linestride_vu = dst_width>>1; + + // Perform address calculation for Y, V and U in main memory with dst_addr as base + unsigned char* dst_addr_main_memory_y = dst_addr; + unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size; + unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2); + + + // for handling misalignment, addresses are precalculated + unsigned char* precalc_src_addr_v = src_addr_v; + unsigned char* precalc_src_addr_u = src_addr_u; + + // calculate scale factors + vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width ); + float y_scale = (float)src_height/(float)dst_height; + + // double buffered processing + // buffer switching + unsigned int curr_src_idx = 0; + unsigned int curr_dst_idx = 0; + unsigned int next_src_idx, next_dst_idx; + + // 2 lines y as output, upper and lowerline + unsigned int curr_interpl_y_upper = 0; + unsigned int next_interpl_y_upper; + unsigned int curr_interpl_y_lower, next_interpl_y_lower; + // only 1 line v/u output, both planes have the same dimension + unsigned int curr_interpl_vu = 0; + unsigned int next_interpl_vu; + + // weights, calculated in every loop iteration + vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_y_upper; + vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower; + vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_vu; + + // line indices for the src picture + float curr_src_y_upper = 0.0f, next_src_y_upper; + float curr_src_y_lower, next_src_y_lower; + float curr_src_vu = 0.0f, next_src_vu; + + // line indices for the dst picture + unsigned int dst_y=0, dst_vu=0; + + // offset for the v and u plane to handle misalignement + unsigned int curr_lsoff_v = 0, next_lsoff_v; + unsigned int curr_lsoff_u = 0, next_lsoff_u; + + // calculate lower line idices + curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale; + curr_interpl_y_lower = (unsigned int)curr_src_y_lower; + // lower line weight + vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower ); + + + // start partially double buffered processing + // get initial data, 2 sets of y, 1 set v, 1 set u + mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 ); + mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF, + 0, 0 ); + mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + + // iteration loop + // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved + // the scaled output is 2 lines y, 1 line v, 1 line u + // the yuv2rgb-converted output is stored to RAM + for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) { + dst_y = dst_vu<<1; + + // calculate next indices + next_src_vu = ((float)dst_vu+1)*y_scale; + next_src_y_upper = ((float)dst_y+2)*y_scale; + next_src_y_lower = ((float)dst_y+3)*y_scale; + + next_interpl_vu = (unsigned int) next_src_vu; + next_interpl_y_upper = (unsigned int) next_src_y_upper; + next_interpl_y_lower = (unsigned int) next_src_y_lower; + + // calculate weight NORTH-SOUTH + vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu ); + vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper ); + vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower ); + + // get next lines + next_src_idx = curr_src_idx^1; + next_dst_idx = curr_dst_idx^1; + + // 4 lines y + mfc_get( y_plane[next_src_idx], + (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + + // 2 lines v + precalc_src_addr_v = src_addr_v+(next_interpl_vu*src_linestride_vu); + next_lsoff_v = ((unsigned int)precalc_src_addr_v)&0x0F; + mfc_get( v_plane[next_src_idx], + ((unsigned int) precalc_src_addr_v)&0xFFFFFFF0, + src_dbl_linestride_vu+(next_lsoff_v<<1), + RETR_BUF+next_src_idx, + 0, 0 ); + // 2 lines u + precalc_src_addr_u = src_addr_u+(next_interpl_vu*src_linestride_vu); + next_lsoff_u = ((unsigned int)precalc_src_addr_u)&0x0F; + mfc_get( u_plane[next_src_idx], + ((unsigned int) precalc_src_addr_u)&0xFFFFFFF0, + src_dbl_linestride_vu+(next_lsoff_v<<1), + RETR_BUF+next_src_idx, + 0, 0 ); + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v, + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u, + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + //--------------------------------------------------------------------------------------------- + + + // update for next cycle + curr_src_idx = next_src_idx; + curr_dst_idx = next_dst_idx; + + curr_interpl_y_upper = next_interpl_y_upper; + curr_interpl_y_lower = next_interpl_y_lower; + curr_interpl_vu = next_interpl_vu; + + vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper; + vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower; + vf_curr_NSweight_vu = vf_next_NSweight_vu; + + curr_src_y_upper = next_src_y_upper; + curr_src_y_lower = next_src_y_lower; + curr_src_vu = next_src_vu; + + curr_lsoff_v = next_lsoff_v; + curr_lsoff_u = next_lsoff_u; + } + + + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w8( v_plane[curr_src_idx]+curr_lsoff_v, + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w8( u_plane[curr_src_idx]+curr_lsoff_u, + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + // wait for completion + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + //--------------------------------------------------------------------------------------------- +} + + +/* + * scale_srcw32_dstw16() + * + * processes an input image of width 32 + * scaling is done to a width 16 + * yuv2rgb conversion on a width of 16 + * result stored in RAM + */ +void scale_srcw32_dstw16() { + // extract parameters + unsigned char* dst_addr = (unsigned char *)parms.dstBuffer; + + unsigned int src_width = parms.src_pixel_width; + unsigned int src_height = parms.src_pixel_height; + unsigned int dst_width = parms.dst_pixel_width; + unsigned int dst_height = parms.dst_pixel_height; + + // YVU + unsigned int src_linestride_y = src_width; + unsigned int src_dbl_linestride_y = src_width<<1; + unsigned int src_linestride_vu = src_width>>1; + unsigned int src_dbl_linestride_vu = src_width; + // scaled YVU + unsigned int scaled_src_linestride_y = dst_width; + + // ram addresses + unsigned char* src_addr_y = parms.y_plane; + unsigned char* src_addr_v = parms.v_plane; + unsigned char* src_addr_u = parms.u_plane; + + unsigned int dst_picture_size = dst_width*dst_height; + + // Sizes for destination + unsigned int dst_dbl_linestride_y = dst_width<<1; + unsigned int dst_dbl_linestride_vu = dst_width>>1; + + // Perform address calculation for Y, V and U in main memory with dst_addr as base + unsigned char* dst_addr_main_memory_y = dst_addr; + unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size; + unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2); + + // calculate scale factors + vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width ); + float y_scale = (float)src_height/(float)dst_height; + + // double buffered processing + // buffer switching + unsigned int curr_src_idx = 0; + unsigned int curr_dst_idx = 0; + unsigned int next_src_idx, next_dst_idx; + + // 2 lines y as output, upper and lowerline + unsigned int curr_interpl_y_upper = 0; + unsigned int next_interpl_y_upper; + unsigned int curr_interpl_y_lower, next_interpl_y_lower; + // only 1 line v/u output, both planes have the same dimension + unsigned int curr_interpl_vu = 0; + unsigned int next_interpl_vu; + + // weights, calculated in every loop iteration + vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_y_upper; + vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower; + vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_vu; + + // line indices for the src picture + float curr_src_y_upper = 0.0f, next_src_y_upper; + float curr_src_y_lower, next_src_y_lower; + float curr_src_vu = 0.0f, next_src_vu; + + // line indices for the dst picture + unsigned int dst_y=0, dst_vu=0; + + // calculate lower line idices + curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale; + curr_interpl_y_lower = (unsigned int)curr_src_y_lower; + // lower line weight + vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower ); + + + // start partially double buffered processing + // get initial data, 2 sets of y, 1 set v, 1 set u + mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 ); + mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF, + 0, 0 ); + mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + + // iteration loop + // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved + // the scaled output is 2 lines y, 1 line v, 1 line u + // the yuv2rgb-converted output is stored to RAM + for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) { + dst_y = dst_vu<<1; + + // calculate next indices + next_src_vu = ((float)dst_vu+1)*y_scale; + next_src_y_upper = ((float)dst_y+2)*y_scale; + next_src_y_lower = ((float)dst_y+3)*y_scale; + + next_interpl_vu = (unsigned int) next_src_vu; + next_interpl_y_upper = (unsigned int) next_src_y_upper; + next_interpl_y_lower = (unsigned int) next_src_y_lower; + + // calculate weight NORTH-SOUTH + vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu ); + vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper ); + vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower ); + + // get next lines + next_src_idx = curr_src_idx^1; + next_dst_idx = curr_dst_idx^1; + + // 4 lines y + mfc_get( y_plane[next_src_idx], + (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + + // 2 lines v + mfc_get( v_plane[next_src_idx], + (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu), + src_dbl_linestride_vu, + RETR_BUF+next_src_idx, + 0, 0 ); + // 2 lines u + mfc_get( u_plane[next_src_idx], + (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu), + src_dbl_linestride_vu, + RETR_BUF+next_src_idx, + 0, 0 ); + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w16( v_plane[curr_src_idx], + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w16( u_plane[curr_src_idx], + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + //--------------------------------------------------------------------------------------------- + + + // update for next cycle + curr_src_idx = next_src_idx; + curr_dst_idx = next_dst_idx; + + curr_interpl_y_upper = next_interpl_y_upper; + curr_interpl_y_lower = next_interpl_y_lower; + curr_interpl_vu = next_interpl_vu; + + vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper; + vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower; + vf_curr_NSweight_vu = vf_next_NSweight_vu; + + curr_src_y_upper = next_src_y_upper; + curr_src_y_lower = next_src_y_lower; + curr_src_vu = next_src_vu; + } + + + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w16( v_plane[curr_src_idx], + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w16( u_plane[curr_src_idx], + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + // wait for completion + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + //--------------------------------------------------------------------------------------------- +} + + +/** + * scale_srcw32_dstw32() + * + * processes an input image of width 32 + * scaling is done to a width 32 + * yuv2rgb conversion on a width of 32 + * result stored in RAM + */ +void scale_srcw32_dstw32() { + // extract parameters + unsigned char* dst_addr = (unsigned char *)parms.dstBuffer; + + unsigned int src_width = parms.src_pixel_width; + unsigned int src_height = parms.src_pixel_height; + unsigned int dst_width = parms.dst_pixel_width; + unsigned int dst_height = parms.dst_pixel_height; + + // YVU + unsigned int src_linestride_y = src_width; + unsigned int src_dbl_linestride_y = src_width<<1; + unsigned int src_linestride_vu = src_width>>1; + unsigned int src_dbl_linestride_vu = src_width; + + // scaled YVU + unsigned int scaled_src_linestride_y = dst_width; + + // ram addresses + unsigned char* src_addr_y = parms.y_plane; + unsigned char* src_addr_v = parms.v_plane; + unsigned char* src_addr_u = parms.u_plane; + + unsigned int dst_picture_size = dst_width*dst_height; + + // Sizes for destination + unsigned int dst_dbl_linestride_y = dst_width<<1; + unsigned int dst_dbl_linestride_vu = dst_width>>1; + + // Perform address calculation for Y, V and U in main memory with dst_addr as base + unsigned char* dst_addr_main_memory_y = dst_addr; + unsigned char* dst_addr_main_memory_v = dst_addr + dst_picture_size; + unsigned char* dst_addr_main_memory_u = dst_addr_main_memory_v +(dst_picture_size>>2); + + // calculate scale factors + vector float vf_x_scale = spu_splats( (float)src_width/(float)dst_width ); + float y_scale = (float)src_height/(float)dst_height; + + // double buffered processing + // buffer switching + unsigned int curr_src_idx = 0; + unsigned int curr_dst_idx = 0; + unsigned int next_src_idx, next_dst_idx; + + // 2 lines y as output, upper and lowerline + unsigned int curr_interpl_y_upper = 0; + unsigned int next_interpl_y_upper; + unsigned int curr_interpl_y_lower, next_interpl_y_lower; + // only 1 line v/u output, both planes have the same dimension + unsigned int curr_interpl_vu = 0; + unsigned int next_interpl_vu; + + // weights, calculated in every loop iteration + vector float vf_curr_NSweight_y_upper = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_y_upper; + vector float vf_curr_NSweight_y_lower, vf_next_NSweight_y_lower; + vector float vf_curr_NSweight_vu = { 0.0f, 0.0f, 0.0f, 0.0f }; + vector float vf_next_NSweight_vu; + + // line indices for the src picture + float curr_src_y_upper = 0.0f, next_src_y_upper; + float curr_src_y_lower, next_src_y_lower; + float curr_src_vu = 0.0f, next_src_vu; + + // line indices for the dst picture + unsigned int dst_y=0, dst_vu=0; + + // calculate lower line idices + curr_src_y_lower = ((float)curr_interpl_y_upper+1)*y_scale; + curr_interpl_y_lower = (unsigned int)curr_src_y_lower; + // lower line weight + vf_curr_NSweight_y_lower = spu_splats( curr_src_y_lower-(float)curr_interpl_y_lower ); + + + // start partially double buffered processing + // get initial data, 2 sets of y, 1 set v, 1 set u + mfc_get( y_plane[curr_src_idx], (unsigned int) src_addr_y, src_dbl_linestride_y, RETR_BUF, 0, 0 ); + mfc_get( y_plane[curr_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(curr_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF, + 0, 0 ); + mfc_get( v_plane[curr_src_idx], (unsigned int) src_addr_v, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + mfc_get( u_plane[curr_src_idx], (unsigned int) src_addr_u, src_dbl_linestride_vu, RETR_BUF, 0, 0 ); + + // iteration loop + // within each iteration 4 lines y, 2 lines v, 2 lines u are retrieved + // the scaled output is 2 lines y, 1 line v, 1 line u + // the yuv2rgb-converted output is stored to RAM + for( dst_vu=0; dst_vu<(dst_height>>1)-1; dst_vu++ ) { + dst_y = dst_vu<<1; + + // calculate next indices + next_src_vu = ((float)dst_vu+1)*y_scale; + next_src_y_upper = ((float)dst_y+2)*y_scale; + next_src_y_lower = ((float)dst_y+3)*y_scale; + + next_interpl_vu = (unsigned int) next_src_vu; + next_interpl_y_upper = (unsigned int) next_src_y_upper; + next_interpl_y_lower = (unsigned int) next_src_y_lower; + + // calculate weight NORTH-SOUTH + vf_next_NSweight_vu = spu_splats( next_src_vu-(float)next_interpl_vu ); + vf_next_NSweight_y_upper = spu_splats( next_src_y_upper-(float)next_interpl_y_upper ); + vf_next_NSweight_y_lower = spu_splats( next_src_y_lower-(float)next_interpl_y_lower ); + + // get next lines + next_src_idx = curr_src_idx^1; + next_dst_idx = curr_dst_idx^1; + + // 4 lines y + mfc_get( y_plane[next_src_idx], + (unsigned int) src_addr_y+(next_interpl_y_upper*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + mfc_get( y_plane[next_src_idx]+src_dbl_linestride_y, + (unsigned int) src_addr_y+(next_interpl_y_lower*src_linestride_y), + src_dbl_linestride_y, + RETR_BUF+next_src_idx, + 0, 0 ); + + // 2 lines v + mfc_get( v_plane[next_src_idx], + (unsigned int) src_addr_v+(next_interpl_vu*src_linestride_vu), + src_dbl_linestride_vu, + RETR_BUF+next_src_idx, + 0, 0 ); + // 2 lines u + mfc_get( u_plane[next_src_idx], + (unsigned int) src_addr_u+(next_interpl_vu*src_linestride_vu), + src_dbl_linestride_vu, + RETR_BUF+next_src_idx, + 0, 0 ); + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w16( v_plane[curr_src_idx], + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w16( u_plane[curr_src_idx], + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + + + // Store the result back to main memory into a destination buffer in YUV format + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + //--------------------------------------------------------------------------------------------- + + + // update for next cycle + curr_src_idx = next_src_idx; + curr_dst_idx = next_dst_idx; + + curr_interpl_y_upper = next_interpl_y_upper; + curr_interpl_y_lower = next_interpl_y_lower; + curr_interpl_vu = next_interpl_vu; + + vf_curr_NSweight_y_upper = vf_curr_NSweight_y_upper; + vf_curr_NSweight_y_lower = vf_curr_NSweight_y_lower; + vf_curr_NSweight_vu = vf_next_NSweight_vu; + + curr_src_y_upper = next_src_y_upper; + curr_src_y_lower = next_src_y_lower; + curr_src_vu = next_src_vu; + } + + + + DMA_WAIT_TAG( (RETR_BUF+curr_src_idx) ); + + // scaling + // work line y_upper + bilinear_scale_line_w16( y_plane[curr_src_idx], + scaled_y_plane[curr_src_idx], + dst_width, + vf_x_scale, + vf_curr_NSweight_y_upper, + src_linestride_y ); + // work line y_lower + bilinear_scale_line_w16( y_plane[curr_src_idx]+src_dbl_linestride_y, + scaled_y_plane[curr_src_idx]+scaled_src_linestride_y, + dst_width, + vf_x_scale, + vf_curr_NSweight_y_lower, + src_linestride_y ); + // work line v + bilinear_scale_line_w16( v_plane[curr_src_idx], + scaled_v_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + // work line u + bilinear_scale_line_w16( u_plane[curr_src_idx], + scaled_u_plane[curr_src_idx], + dst_width>>1, + vf_x_scale, + vf_curr_NSweight_vu, + src_linestride_vu ); + + + // Store the result back to main memory into a destination buffer in YUV format + //--------------------------------------------------------------------------------------------- + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + + // Perform three DMA transfers to 3 different locations in the main memory! + // dst_width: Pixel width of destination image + // dst_addr: Destination address in main memory + // dst_vu: Counter which is incremented one by one + // dst_y: Counter which is twice larger than dst_vu (dst_y = 2*dst_vu) + + mfc_put( scaled_y_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_y + (dst_vu*dst_dbl_linestride_y), // Destination in main memory (addr) + dst_dbl_linestride_y, // Two Y lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_v_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_v + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two V lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + mfc_put( scaled_u_plane[curr_src_idx], // What from local store (addr) + (unsigned int) dst_addr_main_memory_u + (dst_vu*dst_dbl_linestride_vu), // Destination in main memory (addr) + dst_dbl_linestride_vu, // Two U lines (depending on the widht of the destination resolution) + STR_BUF+curr_dst_idx, // Tag + 0, 0 ); + + // wait for completion + DMA_WAIT_TAG( (STR_BUF+curr_dst_idx) ); + //--------------------------------------------------------------------------------------------- +} + + +/* + * bilinear_scale_line_w8() + * + * processes a line of yuv-input, width has to be a multiple of 8 + * scaled yuv-output is written to local store buffer + * + * @param src buffer for 2 lines input + * @param dst_ buffer for 1 line output + * @param dst_width the width of the destination line + * @param vf_x_scale a float vector, at each entry is the x_scale-factor + * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line + * @param src_linestride the stride of the srcline + */ +void bilinear_scale_line_w8( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) { + + unsigned char* dst = dst_; + + unsigned int dst_x; + for( dst_x=0; dst_x<dst_width; dst_x+=8) { + // address calculation for loading the 4 surrounding pixel of each calculated + // destination pixel + vector unsigned int vui_dst_x_tmp = spu_splats( dst_x ); + // lower range->first 4 pixel + // upper range->next 4 pixel + vector unsigned int vui_inc_dst_x_lower_range = { 0, 1, 2, 3 }; + vector unsigned int vui_inc_dst_x_upper_range = { 4, 5, 6, 7 }; + vector unsigned int vui_dst_x_lower_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_lower_range ); + vector unsigned int vui_dst_x_upper_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_upper_range ); + + // calculate weight EAST-WEST + vector float vf_dst_x_lower_range = spu_convtf( vui_dst_x_lower_range, 0 ); + vector float vf_dst_x_upper_range = spu_convtf( vui_dst_x_upper_range, 0 ); + vector float vf_src_x_lower_range = spu_mul( vf_dst_x_lower_range, vf_x_scale ); + vector float vf_src_x_upper_range = spu_mul( vf_dst_x_upper_range, vf_x_scale ); + vector unsigned int vui_interpl_x_lower_range = spu_convtu( vf_src_x_lower_range, 0 ); + vector unsigned int vui_interpl_x_upper_range = spu_convtu( vf_src_x_upper_range, 0 ); + vector float vf_interpl_x_lower_range = spu_convtf( vui_interpl_x_lower_range, 0 ); + vector float vf_interpl_x_upper_range = spu_convtf( vui_interpl_x_upper_range, 0 ); + vector float vf_EWweight_lower_range = spu_sub( vf_src_x_lower_range, vf_interpl_x_lower_range ); + vector float vf_EWweight_upper_range = spu_sub( vf_src_x_upper_range, vf_interpl_x_upper_range ); + + // calculate address offset + // + // pixel NORTH WEST + vector unsigned int vui_off_pixelNW_lower_range = vui_interpl_x_lower_range; + vector unsigned int vui_off_pixelNW_upper_range = vui_interpl_x_upper_range; + + // pixel NORTH EAST-->(offpixelNW+1) + vector unsigned int vui_add_1 = { 1, 1, 1, 1 }; + vector unsigned int vui_off_pixelNE_lower_range = spu_add( vui_off_pixelNW_lower_range, vui_add_1 ); + vector unsigned int vui_off_pixelNE_upper_range = spu_add( vui_off_pixelNW_upper_range, vui_add_1 ); + + // SOUTH-WEST-->(offpixelNW+src_linestride) + vector unsigned int vui_srclinestride = spu_splats( src_linestride ); + vector unsigned int vui_off_pixelSW_lower_range = spu_add( vui_srclinestride, vui_off_pixelNW_lower_range ); + vector unsigned int vui_off_pixelSW_upper_range = spu_add( vui_srclinestride, vui_off_pixelNW_upper_range ); + + // SOUTH-EAST-->(offpixelNW+src_linestride+1) + vector unsigned int vui_off_pixelSE_lower_range = spu_add( vui_srclinestride, vui_off_pixelNE_lower_range ); + vector unsigned int vui_off_pixelSE_upper_range = spu_add( vui_srclinestride, vui_off_pixelNE_upper_range ); + + // calculate each address + vector unsigned int vui_src_ls = spu_splats( (unsigned int) src ); + vector unsigned int vui_addr_pixelNW_lower_range = spu_add( vui_src_ls, vui_off_pixelNW_lower_range ); + vector unsigned int vui_addr_pixelNW_upper_range = spu_add( vui_src_ls, vui_off_pixelNW_upper_range ); + vector unsigned int vui_addr_pixelNE_lower_range = spu_add( vui_src_ls, vui_off_pixelNE_lower_range ); + vector unsigned int vui_addr_pixelNE_upper_range = spu_add( vui_src_ls, vui_off_pixelNE_upper_range ); + + vector unsigned int vui_addr_pixelSW_lower_range = spu_add( vui_src_ls, vui_off_pixelSW_lower_range ); + vector unsigned int vui_addr_pixelSW_upper_range = spu_add( vui_src_ls, vui_off_pixelSW_upper_range ); + vector unsigned int vui_addr_pixelSE_lower_range = spu_add( vui_src_ls, vui_off_pixelSE_lower_range ); + vector unsigned int vui_addr_pixelSE_upper_range = spu_add( vui_src_ls, vui_off_pixelSE_upper_range ); + + // get each pixel + // + // scalar load, afterwards insertion into the right position + // NORTH WEST + vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + vector unsigned char vuc_pixel_NW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 0 )), null_vector, 3 ); + vuc_pixel_NW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 1 )), + vuc_pixel_NW_lower_range, 7 ); + vuc_pixel_NW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 2 )), + vuc_pixel_NW_lower_range, 11 ); + vuc_pixel_NW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_lower_range, 3 )), + vuc_pixel_NW_lower_range, 15 ); + + vector unsigned char vuc_pixel_NW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 0 )), null_vector, 3 ); + vuc_pixel_NW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 1 )), + vuc_pixel_NW_upper_range, 7 ); + vuc_pixel_NW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 2 )), + vuc_pixel_NW_upper_range, 11 ); + vuc_pixel_NW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_upper_range, 3 )), + vuc_pixel_NW_upper_range, 15 ); + + // NORTH EAST + vector unsigned char vuc_pixel_NE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 0 )), null_vector, 3 ); + vuc_pixel_NE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 1 )), + vuc_pixel_NE_lower_range, 7 ); + vuc_pixel_NE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 2 )), + vuc_pixel_NE_lower_range, 11 ); + vuc_pixel_NE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_lower_range, 3 )), + vuc_pixel_NE_lower_range, 15 ); + + vector unsigned char vuc_pixel_NE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 0 )), null_vector, 3 ); + vuc_pixel_NE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 1 )), + vuc_pixel_NE_upper_range, 7 ); + vuc_pixel_NE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 2 )), + vuc_pixel_NE_upper_range, 11 ); + vuc_pixel_NE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_upper_range, 3 )), + vuc_pixel_NE_upper_range, 15 ); + + + // SOUTH WEST + vector unsigned char vuc_pixel_SW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 0 )), null_vector, 3 ); + vuc_pixel_SW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 1 )), + vuc_pixel_SW_lower_range, 7 ); + vuc_pixel_SW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 2 )), + vuc_pixel_SW_lower_range, 11 ); + vuc_pixel_SW_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_lower_range, 3 )), + vuc_pixel_SW_lower_range, 15 ); + + vector unsigned char vuc_pixel_SW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 0 )), null_vector, 3 ); + vuc_pixel_SW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 1 )), + vuc_pixel_SW_upper_range, 7 ); + vuc_pixel_SW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 2 )), + vuc_pixel_SW_upper_range, 11 ); + vuc_pixel_SW_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_upper_range, 3 )), + vuc_pixel_SW_upper_range, 15 ); + + // SOUTH EAST + vector unsigned char vuc_pixel_SE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 0 )), null_vector, 3 ); + vuc_pixel_SE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 1 )), + vuc_pixel_SE_lower_range, 7 ); + vuc_pixel_SE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 2 )), + vuc_pixel_SE_lower_range, 11 ); + vuc_pixel_SE_lower_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_lower_range, 3 )), + vuc_pixel_SE_lower_range, 15 ); + + vector unsigned char vuc_pixel_SE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 0 )), null_vector, 3 ); + vuc_pixel_SE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 1 )), + vuc_pixel_SE_upper_range, 7 ); + vuc_pixel_SE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 2 )), + vuc_pixel_SE_upper_range, 11 ); + vuc_pixel_SE_upper_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_upper_range, 3 )), + vuc_pixel_SE_upper_range, 15 ); + + + // convert to float + vector float vf_pixel_NW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_lower_range, 0 ); + vector float vf_pixel_NW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_upper_range, 0 ); + + vector float vf_pixel_SW_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_lower_range, 0 ); + vector float vf_pixel_SW_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_upper_range, 0 ); + + vector float vf_pixel_NE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_lower_range, 0 ); + vector float vf_pixel_NE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_upper_range, 0 ); + + vector float vf_pixel_SE_lower_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_lower_range, 0 ); + vector float vf_pixel_SE_upper_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_upper_range, 0 ); + + + + // first linear interpolation: EWtop + // EWtop = NW + EWweight*(NE-NW) + // + // lower range + vector float vf_EWtop_lower_range_tmp = spu_sub( vf_pixel_NE_lower_range, vf_pixel_NW_lower_range ); + vector float vf_EWtop_lower_range = spu_madd( vf_EWweight_lower_range, + vf_EWtop_lower_range_tmp, + vf_pixel_NW_lower_range ); + + // upper range + vector float vf_EWtop_upper_range_tmp = spu_sub( vf_pixel_NE_upper_range, vf_pixel_NW_upper_range ); + vector float vf_EWtop_upper_range = spu_madd( vf_EWweight_upper_range, + vf_EWtop_upper_range_tmp, + vf_pixel_NW_upper_range ); + + + + // second linear interpolation: EWbottom + // EWbottom = SW + EWweight*(SE-SW) + // + // lower range + vector float vf_EWbottom_lower_range_tmp = spu_sub( vf_pixel_SE_lower_range, vf_pixel_SW_lower_range ); + vector float vf_EWbottom_lower_range = spu_madd( vf_EWweight_lower_range, + vf_EWbottom_lower_range_tmp, + vf_pixel_SW_lower_range ); + + // upper range + vector float vf_EWbottom_upper_range_tmp = spu_sub( vf_pixel_SE_upper_range, vf_pixel_SW_upper_range ); + vector float vf_EWbottom_upper_range = spu_madd( vf_EWweight_upper_range, + vf_EWbottom_upper_range_tmp, + vf_pixel_SW_upper_range ); + + + + // third linear interpolation: the bilinear interpolated value + // result = EWtop + NSweight*(EWbottom-EWtop); + // + // lower range + vector float vf_result_lower_range_tmp = spu_sub( vf_EWbottom_lower_range, vf_EWtop_lower_range ); + vector float vf_result_lower_range = spu_madd( vf_NSweight, + vf_result_lower_range_tmp, + vf_EWtop_lower_range ); + + // upper range + vector float vf_result_upper_range_tmp = spu_sub( vf_EWbottom_upper_range, vf_EWtop_upper_range ); + vector float vf_result_upper_range = spu_madd( vf_NSweight, + vf_result_upper_range_tmp, + vf_EWtop_upper_range ); + + + // convert back: using saturated arithmetic + vector unsigned int vui_result_lower_range = vfloat_to_vuint( vf_result_lower_range ); + vector unsigned int vui_result_upper_range = vfloat_to_vuint( vf_result_upper_range ); + + // merge results->lower,upper + vector unsigned char vuc_mask_merge_result = { 0x03, 0x07, 0x0B, 0x0F, + 0x13, 0x17, 0x1B, 0x1F, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 }; + + vector unsigned char vuc_result = spu_shuffle( (vector unsigned char) vui_result_lower_range, + (vector unsigned char) vui_result_upper_range, + vuc_mask_merge_result ); + + // partial storing + vector unsigned char vuc_mask_out = { 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF }; + + + // get currently stored data + vector unsigned char vuc_orig = *((vector unsigned char*)dst); + + // clear currently stored data + vuc_orig = spu_and( vuc_orig, + spu_rlqwbyte( vuc_mask_out, ((unsigned int)dst)&0x0F) ); + + // rotate result according to storing address + vuc_result = spu_rlqwbyte( vuc_result, ((unsigned int)dst)&0x0F ); + + // store result + *((vector unsigned char*)dst) = spu_or( vuc_result, + vuc_orig ); + dst += 8; + } +} + + +/* + * bilinear_scale_line_w16() + * + * processes a line of yuv-input, width has to be a multiple of 16 + * scaled yuv-output is written to local store buffer + * + * @param src buffer for 2 lines input + * @param dst_ buffer for 1 line output + * @param dst_width the width of the destination line + * @param vf_x_scale a float vector, at each entry is the x_scale-factor + * @param vf_NSweight a float vector, at each position is the weight NORTH/SOUTH for the current line + * @param src_linestride the stride of the srcline + */ +void bilinear_scale_line_w16( unsigned char* src, unsigned char* dst_, unsigned int dst_width, vector float vf_x_scale, vector float vf_NSweight, unsigned int src_linestride ) { + + unsigned char* dst = dst_; + + unsigned int dst_x; + for( dst_x=0; dst_x<dst_width; dst_x+=16) { + // address calculation for loading the 4 surrounding pixel of each calculated + // destination pixel + vector unsigned int vui_dst_x_tmp = spu_splats( dst_x ); + // parallelised processing + // first range->pixel 1 2 3 4 + // second range->pixel 5 6 7 8 + // third range->pixel 9 10 11 12 + // fourth range->pixel 13 14 15 16 + vector unsigned int vui_inc_dst_x_first_range = { 0, 1, 2, 3 }; + vector unsigned int vui_inc_dst_x_second_range = { 4, 5, 6, 7 }; + vector unsigned int vui_inc_dst_x_third_range = { 8, 9, 10, 11 }; + vector unsigned int vui_inc_dst_x_fourth_range = { 12, 13, 14, 15 }; + vector unsigned int vui_dst_x_first_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_first_range ); + vector unsigned int vui_dst_x_second_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_second_range ); + vector unsigned int vui_dst_x_third_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_third_range ); + vector unsigned int vui_dst_x_fourth_range = spu_add( vui_dst_x_tmp, vui_inc_dst_x_fourth_range ); + + // calculate weight EAST-WEST + vector float vf_dst_x_first_range = spu_convtf( vui_dst_x_first_range, 0 ); + vector float vf_dst_x_second_range = spu_convtf( vui_dst_x_second_range, 0 ); + vector float vf_dst_x_third_range = spu_convtf( vui_dst_x_third_range, 0 ); + vector float vf_dst_x_fourth_range = spu_convtf( vui_dst_x_fourth_range, 0 ); + vector float vf_src_x_first_range = spu_mul( vf_dst_x_first_range, vf_x_scale ); + vector float vf_src_x_second_range = spu_mul( vf_dst_x_second_range, vf_x_scale ); + vector float vf_src_x_third_range = spu_mul( vf_dst_x_third_range, vf_x_scale ); + vector float vf_src_x_fourth_range = spu_mul( vf_dst_x_fourth_range, vf_x_scale ); + vector unsigned int vui_interpl_x_first_range = spu_convtu( vf_src_x_first_range, 0 ); + vector unsigned int vui_interpl_x_second_range = spu_convtu( vf_src_x_second_range, 0 ); + vector unsigned int vui_interpl_x_third_range = spu_convtu( vf_src_x_third_range, 0 ); + vector unsigned int vui_interpl_x_fourth_range = spu_convtu( vf_src_x_fourth_range, 0 ); + vector float vf_interpl_x_first_range = spu_convtf( vui_interpl_x_first_range, 0 ); + vector float vf_interpl_x_second_range = spu_convtf( vui_interpl_x_second_range, 0 ); + vector float vf_interpl_x_third_range = spu_convtf( vui_interpl_x_third_range, 0 ); + vector float vf_interpl_x_fourth_range = spu_convtf( vui_interpl_x_fourth_range, 0 ); + vector float vf_EWweight_first_range = spu_sub( vf_src_x_first_range, vf_interpl_x_first_range ); + vector float vf_EWweight_second_range = spu_sub( vf_src_x_second_range, vf_interpl_x_second_range ); + vector float vf_EWweight_third_range = spu_sub( vf_src_x_third_range, vf_interpl_x_third_range ); + vector float vf_EWweight_fourth_range = spu_sub( vf_src_x_fourth_range, vf_interpl_x_fourth_range ); + + // calculate address offset + // + // pixel NORTH WEST + vector unsigned int vui_off_pixelNW_first_range = vui_interpl_x_first_range; + vector unsigned int vui_off_pixelNW_second_range = vui_interpl_x_second_range; + vector unsigned int vui_off_pixelNW_third_range = vui_interpl_x_third_range; + vector unsigned int vui_off_pixelNW_fourth_range = vui_interpl_x_fourth_range; + + // pixel NORTH EAST-->(offpixelNW+1) + vector unsigned int vui_add_1 = { 1, 1, 1, 1 }; + vector unsigned int vui_off_pixelNE_first_range = spu_add( vui_off_pixelNW_first_range, vui_add_1 ); + vector unsigned int vui_off_pixelNE_second_range = spu_add( vui_off_pixelNW_second_range, vui_add_1 ); + vector unsigned int vui_off_pixelNE_third_range = spu_add( vui_off_pixelNW_third_range, vui_add_1 ); + vector unsigned int vui_off_pixelNE_fourth_range = spu_add( vui_off_pixelNW_fourth_range, vui_add_1 ); + + // SOUTH-WEST-->(offpixelNW+src_linestride) + vector unsigned int vui_srclinestride = spu_splats( src_linestride ); + vector unsigned int vui_off_pixelSW_first_range = spu_add( vui_srclinestride, vui_off_pixelNW_first_range ); + vector unsigned int vui_off_pixelSW_second_range = spu_add( vui_srclinestride, vui_off_pixelNW_second_range ); + vector unsigned int vui_off_pixelSW_third_range = spu_add( vui_srclinestride, vui_off_pixelNW_third_range ); + vector unsigned int vui_off_pixelSW_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNW_fourth_range ); + + // SOUTH-EAST-->(offpixelNW+src_linestride+1) + vector unsigned int vui_off_pixelSE_first_range = spu_add( vui_srclinestride, vui_off_pixelNE_first_range ); + vector unsigned int vui_off_pixelSE_second_range = spu_add( vui_srclinestride, vui_off_pixelNE_second_range ); + vector unsigned int vui_off_pixelSE_third_range = spu_add( vui_srclinestride, vui_off_pixelNE_third_range ); + vector unsigned int vui_off_pixelSE_fourth_range = spu_add( vui_srclinestride, vui_off_pixelNE_fourth_range ); + + // calculate each address + vector unsigned int vui_src_ls = spu_splats( (unsigned int) src ); + vector unsigned int vui_addr_pixelNW_first_range = spu_add( vui_src_ls, vui_off_pixelNW_first_range ); + vector unsigned int vui_addr_pixelNW_second_range = spu_add( vui_src_ls, vui_off_pixelNW_second_range ); + vector unsigned int vui_addr_pixelNW_third_range = spu_add( vui_src_ls, vui_off_pixelNW_third_range ); + vector unsigned int vui_addr_pixelNW_fourth_range = spu_add( vui_src_ls, vui_off_pixelNW_fourth_range ); + + vector unsigned int vui_addr_pixelNE_first_range = spu_add( vui_src_ls, vui_off_pixelNE_first_range ); + vector unsigned int vui_addr_pixelNE_second_range = spu_add( vui_src_ls, vui_off_pixelNE_second_range ); + vector unsigned int vui_addr_pixelNE_third_range = spu_add( vui_src_ls, vui_off_pixelNE_third_range ); + vector unsigned int vui_addr_pixelNE_fourth_range = spu_add( vui_src_ls, vui_off_pixelNE_fourth_range ); + + vector unsigned int vui_addr_pixelSW_first_range = spu_add( vui_src_ls, vui_off_pixelSW_first_range ); + vector unsigned int vui_addr_pixelSW_second_range = spu_add( vui_src_ls, vui_off_pixelSW_second_range ); + vector unsigned int vui_addr_pixelSW_third_range = spu_add( vui_src_ls, vui_off_pixelSW_third_range ); + vector unsigned int vui_addr_pixelSW_fourth_range = spu_add( vui_src_ls, vui_off_pixelSW_fourth_range ); + + vector unsigned int vui_addr_pixelSE_first_range = spu_add( vui_src_ls, vui_off_pixelSE_first_range ); + vector unsigned int vui_addr_pixelSE_second_range = spu_add( vui_src_ls, vui_off_pixelSE_second_range ); + vector unsigned int vui_addr_pixelSE_third_range = spu_add( vui_src_ls, vui_off_pixelSE_third_range ); + vector unsigned int vui_addr_pixelSE_fourth_range = spu_add( vui_src_ls, vui_off_pixelSE_fourth_range ); + + + // get each pixel + // + // scalar load, afterwards insertion into the right position + // NORTH WEST + // first range + vector unsigned char null_vector = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; + vector unsigned char vuc_pixel_NW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 0 )), null_vector, 3 ); + vuc_pixel_NW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 1 )), + vuc_pixel_NW_first_range, 7 ); + vuc_pixel_NW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 2 )), + vuc_pixel_NW_first_range, 11 ); + vuc_pixel_NW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_first_range, 3 )), + vuc_pixel_NW_first_range, 15 ); + // second range + vector unsigned char vuc_pixel_NW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 0 )), null_vector, 3 ); + vuc_pixel_NW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 1 )), + vuc_pixel_NW_second_range, 7 ); + vuc_pixel_NW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 2 )), + vuc_pixel_NW_second_range, 11 ); + vuc_pixel_NW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_second_range, 3 )), + vuc_pixel_NW_second_range, 15 ); + // third range + vector unsigned char vuc_pixel_NW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 0 )), null_vector, 3 ); + vuc_pixel_NW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 1 )), + vuc_pixel_NW_third_range, 7 ); + vuc_pixel_NW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 2 )), + vuc_pixel_NW_third_range, 11 ); + vuc_pixel_NW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_third_range, 3 )), + vuc_pixel_NW_third_range, 15 ); + // fourth range + vector unsigned char vuc_pixel_NW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 0 )), null_vector, 3 ); + vuc_pixel_NW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 1 )), + vuc_pixel_NW_fourth_range, 7 ); + vuc_pixel_NW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 2 )), + vuc_pixel_NW_fourth_range, 11 ); + vuc_pixel_NW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNW_fourth_range, 3 )), + vuc_pixel_NW_fourth_range, 15 ); + + // NORTH EAST + // first range + vector unsigned char vuc_pixel_NE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 0 )), null_vector, 3 ); + vuc_pixel_NE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 1 )), + vuc_pixel_NE_first_range, 7 ); + vuc_pixel_NE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 2 )), + vuc_pixel_NE_first_range, 11 ); + vuc_pixel_NE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_first_range, 3 )), + vuc_pixel_NE_first_range, 15 ); + // second range + vector unsigned char vuc_pixel_NE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 0 )), null_vector, 3 ); + vuc_pixel_NE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 1 )), + vuc_pixel_NE_second_range, 7 ); + vuc_pixel_NE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 2 )), + vuc_pixel_NE_second_range, 11 ); + vuc_pixel_NE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_second_range, 3 )), + vuc_pixel_NE_second_range, 15 ); + // third range + vector unsigned char vuc_pixel_NE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 0 )), null_vector, 3 ); + vuc_pixel_NE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 1 )), + vuc_pixel_NE_third_range, 7 ); + vuc_pixel_NE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 2 )), + vuc_pixel_NE_third_range, 11 ); + vuc_pixel_NE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_third_range, 3 )), + vuc_pixel_NE_third_range, 15 ); + // fourth range + vector unsigned char vuc_pixel_NE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 0 )), null_vector, 3 ); + vuc_pixel_NE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 1 )), + vuc_pixel_NE_fourth_range, 7 ); + vuc_pixel_NE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 2 )), + vuc_pixel_NE_fourth_range, 11 ); + vuc_pixel_NE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelNE_fourth_range, 3 )), + vuc_pixel_NE_fourth_range, 15 ); + + // SOUTH WEST + // first range + vector unsigned char vuc_pixel_SW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 0 )), null_vector, 3 ); + vuc_pixel_SW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 1 )), + vuc_pixel_SW_first_range, 7 ); + vuc_pixel_SW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 2 )), + vuc_pixel_SW_first_range, 11 ); + vuc_pixel_SW_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_first_range, 3 )), + vuc_pixel_SW_first_range, 15 ); + // second range + vector unsigned char vuc_pixel_SW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 0 )), null_vector, 3 ); + vuc_pixel_SW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 1 )), + vuc_pixel_SW_second_range, 7 ); + vuc_pixel_SW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 2 )), + vuc_pixel_SW_second_range, 11 ); + vuc_pixel_SW_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_second_range, 3 )), + vuc_pixel_SW_second_range, 15 ); + // third range + vector unsigned char vuc_pixel_SW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 0 )), null_vector, 3 ); + vuc_pixel_SW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 1 )), + vuc_pixel_SW_third_range, 7 ); + vuc_pixel_SW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 2 )), + vuc_pixel_SW_third_range, 11 ); + vuc_pixel_SW_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_third_range, 3 )), + vuc_pixel_SW_third_range, 15 ); + // fourth range + vector unsigned char vuc_pixel_SW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 0 )), null_vector, 3 ); + vuc_pixel_SW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 1 )), + vuc_pixel_SW_fourth_range, 7 ); + vuc_pixel_SW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 2 )), + vuc_pixel_SW_fourth_range, 11 ); + vuc_pixel_SW_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSW_fourth_range, 3 )), + vuc_pixel_SW_fourth_range, 15 ); + + // NORTH EAST + // first range + vector unsigned char vuc_pixel_SE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 0 )), null_vector, 3 ); + vuc_pixel_SE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 1 )), + vuc_pixel_SE_first_range, 7 ); + vuc_pixel_SE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 2 )), + vuc_pixel_SE_first_range, 11 ); + vuc_pixel_SE_first_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_first_range, 3 )), + vuc_pixel_SE_first_range, 15 ); + // second range + vector unsigned char vuc_pixel_SE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 0 )), null_vector, 3 ); + vuc_pixel_SE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 1 )), + vuc_pixel_SE_second_range, 7 ); + vuc_pixel_SE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 2 )), + vuc_pixel_SE_second_range, 11 ); + vuc_pixel_SE_second_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_second_range, 3 )), + vuc_pixel_SE_second_range, 15 ); + // third range + vector unsigned char vuc_pixel_SE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 0 )), null_vector, 3 ); + vuc_pixel_SE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 1 )), + vuc_pixel_SE_third_range, 7 ); + vuc_pixel_SE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 2 )), + vuc_pixel_SE_third_range, 11 ); + vuc_pixel_SE_third_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_third_range, 3 )), + vuc_pixel_SE_third_range, 15 ); + // fourth range + vector unsigned char vuc_pixel_SE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 0 )), null_vector, 3 ); + vuc_pixel_SE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 1 )), + vuc_pixel_SE_fourth_range, 7 ); + vuc_pixel_SE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 2 )), + vuc_pixel_SE_fourth_range, 11 ); + vuc_pixel_SE_fourth_range = spu_insert( + *((unsigned char*) spu_extract( vui_addr_pixelSE_fourth_range, 3 )), + vuc_pixel_SE_fourth_range, 15 ); + + + + // convert to float + vector float vf_pixel_NW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_first_range, 0 ); + vector float vf_pixel_NW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_second_range, 0 ); + vector float vf_pixel_NW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_third_range, 0 ); + vector float vf_pixel_NW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NW_fourth_range, 0 ); + + vector float vf_pixel_NE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_first_range, 0 ); + vector float vf_pixel_NE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_second_range, 0 ); + vector float vf_pixel_NE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_third_range, 0 ); + vector float vf_pixel_NE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_NE_fourth_range, 0 ); + + vector float vf_pixel_SW_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_first_range, 0 ); + vector float vf_pixel_SW_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_second_range, 0 ); + vector float vf_pixel_SW_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_third_range, 0 ); + vector float vf_pixel_SW_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SW_fourth_range, 0 ); + + vector float vf_pixel_SE_first_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_first_range, 0 ); + vector float vf_pixel_SE_second_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_second_range, 0 ); + vector float vf_pixel_SE_third_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_third_range, 0 ); + vector float vf_pixel_SE_fourth_range = spu_convtf( (vector unsigned int) vuc_pixel_SE_fourth_range, 0 ); + + // first linear interpolation: EWtop + // EWtop = NW + EWweight*(NE-NW) + // + // first range + vector float vf_EWtop_first_range_tmp = spu_sub( vf_pixel_NE_first_range, vf_pixel_NW_first_range ); + vector float vf_EWtop_first_range = spu_madd( vf_EWweight_first_range, + vf_EWtop_first_range_tmp, + vf_pixel_NW_first_range ); + + // second range + vector float vf_EWtop_second_range_tmp = spu_sub( vf_pixel_NE_second_range, vf_pixel_NW_second_range ); + vector float vf_EWtop_second_range = spu_madd( vf_EWweight_second_range, + vf_EWtop_second_range_tmp, + vf_pixel_NW_second_range ); + + // third range + vector float vf_EWtop_third_range_tmp = spu_sub( vf_pixel_NE_third_range, vf_pixel_NW_third_range ); + vector float vf_EWtop_third_range = spu_madd( vf_EWweight_third_range, + vf_EWtop_third_range_tmp, + vf_pixel_NW_third_range ); + + // fourth range + vector float vf_EWtop_fourth_range_tmp = spu_sub( vf_pixel_NE_fourth_range, vf_pixel_NW_fourth_range ); + vector float vf_EWtop_fourth_range = spu_madd( vf_EWweight_fourth_range, + vf_EWtop_fourth_range_tmp, + vf_pixel_NW_fourth_range ); + + + + // second linear interpolation: EWbottom + // EWbottom = SW + EWweight*(SE-SW) + // + // first range + vector float vf_EWbottom_first_range_tmp = spu_sub( vf_pixel_SE_first_range, vf_pixel_SW_first_range ); + vector float vf_EWbottom_first_range = spu_madd( vf_EWweight_first_range, + vf_EWbottom_first_range_tmp, + vf_pixel_SW_first_range ); + + // second range + vector float vf_EWbottom_second_range_tmp = spu_sub( vf_pixel_SE_second_range, vf_pixel_SW_second_range ); + vector float vf_EWbottom_second_range = spu_madd( vf_EWweight_second_range, + vf_EWbottom_second_range_tmp, + vf_pixel_SW_second_range ); + // first range + vector float vf_EWbottom_third_range_tmp = spu_sub( vf_pixel_SE_third_range, vf_pixel_SW_third_range ); + vector float vf_EWbottom_third_range = spu_madd( vf_EWweight_third_range, + vf_EWbottom_third_range_tmp, + vf_pixel_SW_third_range ); + + // first range + vector float vf_EWbottom_fourth_range_tmp = spu_sub( vf_pixel_SE_fourth_range, vf_pixel_SW_fourth_range ); + vector float vf_EWbottom_fourth_range = spu_madd( vf_EWweight_fourth_range, + vf_EWbottom_fourth_range_tmp, + vf_pixel_SW_fourth_range ); + + + + // third linear interpolation: the bilinear interpolated value + // result = EWtop + NSweight*(EWbottom-EWtop); + // + // first range + vector float vf_result_first_range_tmp = spu_sub( vf_EWbottom_first_range, vf_EWtop_first_range ); + vector float vf_result_first_range = spu_madd( vf_NSweight, + vf_result_first_range_tmp, + vf_EWtop_first_range ); + + // second range + vector float vf_result_second_range_tmp = spu_sub( vf_EWbottom_second_range, vf_EWtop_second_range ); + vector float vf_result_second_range = spu_madd( vf_NSweight, + vf_result_second_range_tmp, + vf_EWtop_second_range ); + + // third range + vector float vf_result_third_range_tmp = spu_sub( vf_EWbottom_third_range, vf_EWtop_third_range ); + vector float vf_result_third_range = spu_madd( vf_NSweight, + vf_result_third_range_tmp, + vf_EWtop_third_range ); + + // fourth range + vector float vf_result_fourth_range_tmp = spu_sub( vf_EWbottom_fourth_range, vf_EWtop_fourth_range ); + vector float vf_result_fourth_range = spu_madd( vf_NSweight, + vf_result_fourth_range_tmp, + vf_EWtop_fourth_range ); + + + + // convert back: using saturated arithmetic + vector unsigned int vui_result_first_range = vfloat_to_vuint( vf_result_first_range ); + vector unsigned int vui_result_second_range = vfloat_to_vuint( vf_result_second_range ); + vector unsigned int vui_result_third_range = vfloat_to_vuint( vf_result_third_range ); + vector unsigned int vui_result_fourth_range = vfloat_to_vuint( vf_result_fourth_range ); + + // merge results->lower,upper + vector unsigned char vuc_mask_merge_result_first_second = { 0x03, 0x07, 0x0B, 0x0F, + 0x13, 0x17, 0x1B, 0x1F, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00 }; + + vector unsigned char vuc_mask_merge_result_third_fourth = { 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + 0x03, 0x07, 0x0B, 0x0F, + 0x13, 0x17, 0x1B, 0x1F }; + + vector unsigned char vuc_result_first_second = + spu_shuffle( (vector unsigned char) vui_result_first_range, + (vector unsigned char) vui_result_second_range, + vuc_mask_merge_result_first_second ); + + vector unsigned char vuc_result_third_fourth = + spu_shuffle( (vector unsigned char) vui_result_third_range, + (vector unsigned char) vui_result_fourth_range, + vuc_mask_merge_result_third_fourth ); + + // store result + *((vector unsigned char*)dst) = spu_or( vuc_result_first_second, + vuc_result_third_fourth ); + dst += 16; + } +} +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/spulibs/fb_writer.c Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,193 @@ +/* + * SDL - Simple DirectMedia Layer + * CELL BE Support for PS3 Framebuffer + * Copyright (C) 2008, 2009 International Business Machines Corporation + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + * + * Martin Lowinski <lowinski [at] de [dot] ibm [ibm] com> + * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com> + * SPE code based on research by: + * Rene Becker + * Thimo Emmerich + */ + +#include "spu_common.h" + +#include <spu_intrinsics.h> +#include <spu_mfcio.h> +#include <stdio.h> +#include <string.h> + +// Debugging +//#define DEBUG + +#ifdef DEBUG +#define deprintf(fmt, args... ) \ + fprintf( stdout, fmt, ##args ); \ + fflush( stdout ); +#else +#define deprintf( fmt, args... ) +#endif + +void cpy_to_fb(unsigned int); + +/* fb_writer_spu parms */ +static volatile struct fb_writer_parms_t parms __attribute__ ((aligned(128))); + +/* Code running on SPU */ +int main(unsigned long long spe_id __attribute__ ((unused)), unsigned long long argp __attribute__ ((unused))) +{ + deprintf("[SPU] fb_writer_spu is up... (on SPE #%llu)\n", spe_id); + uint32_t ea_mfc, mbox; + // send ready message + spu_write_out_mbox(SPU_READY); + + while (1) { + /* Check mailbox */ + mbox = spu_read_in_mbox(); + deprintf("[SPU] Message is %u\n", mbox); + switch (mbox) { + case SPU_EXIT: + deprintf("[SPU] fb_writer goes down...\n"); + return 0; + case SPU_START: + break; + default: + deprintf("[SPU] Cannot handle message\n"); + continue; + } + + /* Tag Manager setup */ + unsigned int tags; + tags = mfc_multi_tag_reserve(5); + if (tags == MFC_TAG_INVALID) { + deprintf("[SPU] Failed to reserve mfc tags on fb_writer\n"); + return 0; + } + + /* Framebuffer parms */ + ea_mfc = spu_read_in_mbox(); + deprintf("[SPU] Message on fb_writer is %u\n", ea_mfc); + spu_mfcdma32(&parms, (unsigned int)ea_mfc, + sizeof(struct fb_writer_parms_t), tags, + MFC_GET_CMD); + deprintf("[SPU] argp = %u\n", (unsigned int)argp); + DMA_WAIT_TAG(tags); + + /* Copy parms->data to framebuffer */ + deprintf("[SPU] Copying to framebuffer started\n"); + cpy_to_fb(tags); + deprintf("[SPU] Copying to framebuffer done!\n"); + + mfc_multi_tag_release(tags, 5); + deprintf("[SPU] fb_writer_spu... done!\n"); + /* Send FIN msg */ + spu_write_out_mbox(SPU_FIN); + } + + return 0; +} + +void cpy_to_fb(unsigned int tag_id_base) +{ + unsigned int i; + unsigned char current_buf; + uint8_t *in = parms.data; + + /* Align fb pointer which was centered before */ + uint8_t *fb = + (unsigned char *)((unsigned int)parms.center & 0xFFFFFFF0); + + uint32_t bounded_input_height = parms.bounded_input_height; + uint32_t bounded_input_width = parms.bounded_input_width; + uint32_t fb_pixel_size = parms.fb_pixel_size; + + uint32_t out_line_stride = parms.out_line_stride; + uint32_t in_line_stride = parms.in_line_stride; + uint32_t in_line_size = bounded_input_width * fb_pixel_size; + + current_buf = 0; + + /* Local store buffer */ + static volatile uint8_t buf[4][BUFFER_SIZE] + __attribute__ ((aligned(128))); + /* do 4-times multibuffering using DMA list, process in two steps */ + for (i = 0; i < bounded_input_height >> 2; i++) { + /* first buffer */ + DMA_WAIT_TAG(tag_id_base + 1); + // retrieve buffer + spu_mfcdma32(buf[0], (unsigned int)in, in_line_size, + tag_id_base + 1, MFC_GETB_CMD); + DMA_WAIT_TAG(tag_id_base + 1); + // store buffer + spu_mfcdma32(buf[0], (unsigned int)fb, in_line_size, + tag_id_base + 1, MFC_PUTB_CMD); + in += in_line_stride; + fb += out_line_stride; + deprintf("[SPU] 1st buffer copied in=0x%x, fb=0x%x\n", in, + fb); + + /* second buffer */ + DMA_WAIT_TAG(tag_id_base + 2); + // retrieve buffer + spu_mfcdma32(buf[1], (unsigned int)in, in_line_size, + tag_id_base + 2, MFC_GETB_CMD); + DMA_WAIT_TAG(tag_id_base + 2); + // store buffer + spu_mfcdma32(buf[1], (unsigned int)fb, in_line_size, + tag_id_base + 2, MFC_PUTB_CMD); + in += in_line_stride; + fb += out_line_stride; + deprintf("[SPU] 2nd buffer copied in=0x%x, fb=0x%x\n", in, + fb); + + /* third buffer */ + DMA_WAIT_TAG(tag_id_base + 3); + // retrieve buffer + spu_mfcdma32(buf[2], (unsigned int)in, in_line_size, + tag_id_base + 3, MFC_GETB_CMD); + DMA_WAIT_TAG(tag_id_base + 3); + // store buffer + spu_mfcdma32(buf[2], (unsigned int)fb, in_line_size, + tag_id_base + 3, MFC_PUTB_CMD); + in += in_line_stride; + fb += out_line_stride; + deprintf("[SPU] 3rd buffer copied in=0x%x, fb=0x%x\n", in, + fb); + + /* fourth buffer */ + DMA_WAIT_TAG(tag_id_base + 4); + // retrieve buffer + spu_mfcdma32(buf[3], (unsigned int)in, in_line_size, + tag_id_base + 4, MFC_GETB_CMD); + DMA_WAIT_TAG(tag_id_base + 4); + // store buffer + spu_mfcdma32(buf[3], (unsigned int)fb, in_line_size, + tag_id_base + 4, MFC_PUTB_CMD); + in += in_line_stride; + fb += out_line_stride; + deprintf("[SPU] 4th buffer copied in=0x%x, fb=0x%x\n", in, + fb); + deprintf("[SPU] Loop #%i, bounded_input_height=%i\n", i, + bounded_input_height >> 2); + } + DMA_WAIT_TAG(tag_id_base + 2); + DMA_WAIT_TAG(tag_id_base + 3); + DMA_WAIT_TAG(tag_id_base + 4); +} + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/spulibs/spu_common.h Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,108 @@ +/* + * SDL - Simple DirectMedia Layer + * CELL BE Support for PS3 Framebuffer + * Copyright (C) 2008, 2009 International Business Machines Corporation + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + * + * Martin Lowinski <lowinski [at] de [dot] ibm [ibm] com> + * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com> + * SPE code based on research by: + * Rene Becker + * Thimo Emmerich + */ + +/* Common definitions/makros for SPUs */ + +#ifndef _SPU_COMMON_H +#define _SPU_COMMON_H + +#include <stdio.h> +#include <stdint.h> +#include <string.h> + +/* Tag management */ +#define DMA_WAIT_TAG(_tag) \ + mfc_write_tag_mask(1<<(_tag)); \ + mfc_read_tag_status_all(); + +/* SPU mailbox messages */ +#define SPU_READY 0 +#define SPU_START 1 +#define SPU_FIN 2 +#define SPU_EXIT 3 + +/* Tags */ +#define RETR_BUF 0 +#define STR_BUF 1 +#define TAG_INIT 2 + +/* Buffersizes */ +#define MAX_HDTV_WIDTH 1920 +#define MAX_HDTV_HEIGHT 1080 +/* One stride of HDTV */ +#define BUFFER_SIZE 7680 + +/* fb_writer ppu/spu exchange parms */ +struct fb_writer_parms_t { + uint8_t *data; + uint8_t *center; + uint32_t out_line_stride; + uint32_t in_line_stride; + uint32_t bounded_input_height; + uint32_t bounded_input_width; + uint32_t fb_pixel_size; + + /* This padding is to fulfill the need for 16 byte alignment. On parm change, update! */ + char padding[4]; +} __attribute__((aligned(128))); + +/* yuv2rgb ppu/spu exchange parms */ +struct yuv2rgb_parms_t { + uint8_t* y_plane; + uint8_t* v_plane; + uint8_t* u_plane; + + uint8_t* dstBuffer; + + unsigned int src_pixel_width; + unsigned int src_pixel_height; + + /* This padding is to fulfill the need for 16 byte alignment. On parm change, update! */ + char padding[128 - ((4 * sizeof(uint8_t *) + 2 * sizeof(unsigned int)) & 0x7F)]; +} __attribute__((aligned(128))); + +/* bilin_scaler ppu/spu exchange parms */ +struct scale_parms_t { + uint8_t* y_plane; + uint8_t* v_plane; + uint8_t* u_plane; + + uint8_t* dstBuffer; + + unsigned int src_pixel_width; + unsigned int src_pixel_height; + + unsigned int dst_pixel_width; + unsigned int dst_pixel_height; + + /* This padding is to fulfill the need for 16 byte alignment. On parm change, update! */ + char padding[128 - ((4 * sizeof(uint8_t *) + 4 * sizeof(unsigned int)) & 0x7F)]; +} __attribute__((aligned(128))); + +#endif /* _SPU_COMMON_H */ + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/video/ps3/spulibs/yuv2rgb.c Mon Sep 07 04:51:29 2009 +0000 @@ -0,0 +1,662 @@ +/* + * SDL - Simple DirectMedia Layer + * CELL BE Support for PS3 Framebuffer + * Copyright (C) 2008, 2009 International Business Machines Corporation + * + * This library is free software; you can redistribute it and/or modify it + * under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + * + * Martin Lowinski <lowinski [at] de [dot] ibm [ibm] com> + * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com> + * SPE code based on research by: + * Rene Becker + * Thimo Emmerich + */ + +#include "spu_common.h" + +#include <spu_intrinsics.h> +#include <spu_mfcio.h> + +// Debugging +//#define DEBUG + +// Test environment for /2 resolutions +//#define TESTING + +#ifdef DEBUG +#define deprintf(fmt, args... ) \ + fprintf( stdout, fmt, ##args ); \ + fflush( stdout ); +#else +#define deprintf( fmt, args... ) +#endif + +struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128))); + +/* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored + * there might be the need to retrieve misaligned data, adjust + * incoming v and u plane to be able to handle this (add 128) + */ +unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128))); +unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128))); +unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128))); + +/* A maximum of 4 lines BGRA are stored, 4 byte per pixel */ +unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128))); + +/* some vectors needed by the float to int conversion */ +static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f }; +static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f }; + +void yuv_to_rgb_w16(); +void yuv_to_rgb_w32(); + +void yuv_to_rgb_w2_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr, unsigned int width); +void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width); + + +int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused))) +{ + deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id); + uint32_t ea_mfc, mbox; + // send ready message + spu_write_out_mbox(SPU_READY); + + while (1) { + /* Check mailbox */ + mbox = spu_read_in_mbox(); + deprintf("[SPU] Message is %u\n", mbox); + switch (mbox) { + case SPU_EXIT: + deprintf("[SPU] yuv2rgb_converter goes down...\n"); + return 0; + case SPU_START: + break; + default: + deprintf("[SPU] Cannot handle message\n"); + continue; + } + + /* Tag Manager setup */ + unsigned int tag_id; + tag_id = mfc_multi_tag_reserve(1); + if (tag_id == MFC_TAG_INVALID) { + deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n"); + return 0; + } + + /* DMA transfer for the input parameters */ + ea_mfc = spu_read_in_mbox(); + deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc); + spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD); + DMA_WAIT_TAG(tag_id); + + /* There are alignment issues that involve handling of special cases + * a width of 32 results in a width of 16 in the chrominance + * --> choose the proper handling to optimize the performance + */ + deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height); + if (!(parms_converter.src_pixel_width & 0x1f)) { + deprintf("[SPU] Using yuv_to_rgb_w16\n"); + yuv_to_rgb_w16(); + } else { + deprintf("[SPU] Using yuv_to_rgb_w32\n"); + yuv_to_rgb_w32(); + } + + mfc_multi_tag_release(tag_id, 1); + deprintf("[SPU] yuv2rgb_spu... done!\n"); + /* Send FIN message */ + spu_write_out_mbox(SPU_FIN); + } + + return 0; +} + + +/* + * float_to_char() + * + * converts a float to a character using saturated + * arithmetic + * + * @param s float for conversion + * @returns converted character + */ +inline static unsigned char float_to_char(float s) { + vector float vec_s = spu_splats(s); + vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s); + vec_s = spu_sel(vec_s, vec_0_1, select_1); + + vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255); + vec_s = spu_sel(vec_s, vec_255, select_2); + return (unsigned char) spu_extract(vec_s,0); +} + + +/* + * vfloat_to_vuint() + * + * converts a float vector to an unsinged int vector using saturated + * arithmetic + * + * @param vec_s float vector for conversion + * @returns converted unsigned int vector + */ +inline static vector unsigned int vfloat_to_vuint(vector float vec_s) { + vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s); + vec_s = spu_sel(vec_s, vec_0_1, select_1); + + vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255); + vec_s = spu_sel(vec_s, vec_255, select_2); + return spu_convtu(vec_s,0); +} + + +void yuv_to_rgb_w16() { + // Pixel dimensions of the picture + uint32_t width, height; + + // Extract parameters + width = parms_converter.src_pixel_width; + height = parms_converter.src_pixel_height; + + // Plane data management + // Y + unsigned char* ram_addr_y = parms_converter.y_plane; + // V + unsigned char* ram_addr_v = parms_converter.v_plane; + // U + unsigned char* ram_addr_u = parms_converter.u_plane; + + // BGRA + unsigned char* ram_addr_bgra = parms_converter.dstBuffer; + + // Strides + unsigned int stride_y = width; + unsigned int stride_vu = width>>1; + + // Buffer management + unsigned int buf_idx = 0; + unsigned int size_4lines_y = stride_y<<2; + unsigned int size_2lines_y = stride_y<<1; + unsigned int size_2lines_vu = stride_vu<<1; + + // 2*width*4byte_per_pixel + unsigned int size_2lines_bgra = width<<3; + + + // start double-buffered processing + // 4 lines y + spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD); + + // 2 lines v + spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD); + + // 2 lines u + spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD); + + // Wait for these transfers to be completed + DMA_WAIT_TAG((RETR_BUF + buf_idx)); + + unsigned int i; + for(i=0; i<(height>>2)-1; i++) { + + buf_idx^=1; + + // 4 lines y + spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD); + + // 2 lines v + spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD); + + // 2 lines u + spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD); + + DMA_WAIT_TAG((RETR_BUF + buf_idx)); + + buf_idx^=1; + + + // Convert YUV to BGRA, store it back (first two lines) +#ifndef TESTING + yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); + + // Next two lines + yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y, + v_plane[buf_idx] + stride_vu, + u_plane[buf_idx] + stride_vu, + bgra + size_2lines_bgra, + width); +#else + yuv_to_rgb_w2_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); + + // Next two lines + yuv_to_rgb_w2_line(y_plane[buf_idx] + size_2lines_y, + v_plane[buf_idx] + stride_vu, + u_plane[buf_idx] + stride_vu, + bgra + size_2lines_bgra, + width); +#endif + + // Wait for previous storing transfer to be completed + DMA_WAIT_TAG(STR_BUF); + + // Store converted lines in two steps->max transfer size 16384 + spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); + ram_addr_bgra += size_2lines_bgra; + spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); + ram_addr_bgra += size_2lines_bgra; + + // Move 4 lines + ram_addr_y += size_4lines_y; + ram_addr_v += size_2lines_vu; + ram_addr_u += size_2lines_vu; + + buf_idx^=1; + } + +#ifndef TESTING + // Convert YUV to BGRA, store it back (first two lines) + yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); + + // Next two lines + yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y, + v_plane[buf_idx] + stride_vu, + u_plane[buf_idx] + stride_vu, + bgra + size_2lines_bgra, + width); +#else + // Convert YUV to BGRA, store it back (first two lines) + yuv_to_rgb_w2_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); + + // Next two lines + yuv_to_rgb_w2_line(y_plane[buf_idx] + size_2lines_y, + v_plane[buf_idx] + stride_vu, + u_plane[buf_idx] + stride_vu, + bgra + size_2lines_bgra, + width); +#endif + + // Wait for previous storing transfer to be completed + DMA_WAIT_TAG(STR_BUF); + spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); + ram_addr_bgra += size_2lines_bgra; + spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); + + // wait for previous storing transfer to be completed + DMA_WAIT_TAG(STR_BUF); + +} + + +void yuv_to_rgb_w32() { + // Pixel dimensions of the picture + uint32_t width, height; + + // Extract parameters + width = parms_converter.src_pixel_width; + height = parms_converter.src_pixel_height; + + // Plane data management + // Y + unsigned char* ram_addr_y = parms_converter.y_plane; + // V + unsigned char* ram_addr_v = parms_converter.v_plane; + // U + unsigned char* ram_addr_u = parms_converter.u_plane; + + // BGRA + unsigned char* ram_addr_bgra = parms_converter.dstBuffer; + + // Strides + unsigned int stride_y = width; + unsigned int stride_vu = width>>1; + + // Buffer management + unsigned int buf_idx = 0; + unsigned int size_4lines_y = stride_y<<2; + unsigned int size_2lines_y = stride_y<<1; + unsigned int size_2lines_vu = stride_vu<<1; + + // 2*width*4byte_per_pixel + unsigned int size_2lines_bgra = width<<3; + + // start double-buffered processing + // 4 lines y + spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD); + // 2 lines v + spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD); + // 2 lines u + spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD); + + // Wait for these transfers to be completed + DMA_WAIT_TAG((RETR_BUF + buf_idx)); + + unsigned int i; + for(i=0; i < (height>>2)-1; i++) { + buf_idx^=1; + // 4 lines y + spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD); + deprintf("4lines = %d\n", size_4lines_y); + // 2 lines v + spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD); + deprintf("2lines = %d\n", size_2lines_vu); + // 2 lines u + spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD); + deprintf("2lines = %d\n", size_2lines_vu); + + DMA_WAIT_TAG((RETR_BUF + buf_idx)); + + buf_idx^=1; + + // Convert YUV to BGRA, store it back (first two lines) + yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); + + // Next two lines + yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y, + v_plane[buf_idx] + stride_vu, + u_plane[buf_idx] + stride_vu, + bgra + size_2lines_bgra, + width); + + // Wait for previous storing transfer to be completed + DMA_WAIT_TAG(STR_BUF); + + // Store converted lines in two steps->max transfer size 16384 + spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); + ram_addr_bgra += size_2lines_bgra; + spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); + ram_addr_bgra += size_2lines_bgra; + + // Move 4 lines + ram_addr_y += size_4lines_y; + ram_addr_v += size_2lines_vu; + ram_addr_u += size_2lines_vu; + + buf_idx^=1; + } + + // Convert YUV to BGRA, store it back (first two lines) + yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); + + // Next two lines + yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y, + v_plane[buf_idx] + stride_vu, + u_plane[buf_idx] + stride_vu, + bgra + size_2lines_bgra, + width); + + // Wait for previous storing transfer to be completed + DMA_WAIT_TAG(STR_BUF); + spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); + ram_addr_bgra += size_2lines_bgra; + spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); + + // Wait for previous storing transfer to be completed + DMA_WAIT_TAG(STR_BUF); +} + + +/* Some vectors needed by the yuv 2 rgb conversion algorithm */ +const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f }; +const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 }; +const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 }; +const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B }; +const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F }; + +const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f}; +const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f}; +const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f}; +const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f}; + +const vector unsigned int vec_alpha = { 255 << 24, 255 << 24, 255 << 24, 255 << 24 }; + +const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 }; +const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F }; + + +#ifdef TESTING +/* + * yuv_to_rgb_w2() + * + * - converts x * 4 pixels from YUV to RGB + * - two lines of YUV are taken as input. + * - width has to be a multiple of 2 (= 4 pixel) + * + * @param y_addr address of the y plane (local store) + * @param v_addr address of the v plane (local store) + * @param u_addr address of the u plane (local store) + * @param bgra_addr_char address of the bgra output buffer (local store) + * @param width the width of a line in pixel + */ +void yuv_to_rgb_w2_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_char, unsigned int width) { + // each pixel is stored as an integer + unsigned int* bgra_addr = (unsigned int*) bgra_addr_char; + + unsigned int x; + // Go through each line in steps of 2, because every U and V value is connected to 4 pixels Y (YUV 4:2:0) + for(x = 0; x < width; x+=2) { + // Get the 4 Y, 1 U and 1 V values + const unsigned char Y_1 = *(y_addr + x); + const unsigned char Y_2 = *(y_addr + x + 1); + const unsigned char Y_3 = *(y_addr + x + width); + const unsigned char Y_4 = *(y_addr + x + width + 1); + const unsigned char U = *(u_addr + (x >> 1)); + const unsigned char V = *(v_addr + (x >> 1)); + + // Start converting + float V_minus_128 = (float)((float)V - 128.0f); + float U_minus_128 = (float)((float)U - 128.0f); + + float R_precalculate = 1.403f * V_minus_128; + float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128); + float B_precalculate = 1.773f * U_minus_128; + + // Cast the results + const unsigned char R_1 = float_to_char((Y_1 + R_precalculate)); + const unsigned char R_2 = float_to_char((Y_2 + R_precalculate)); + const unsigned char R_3 = float_to_char((Y_3 + R_precalculate)); + const unsigned char R_4 = float_to_char((Y_4 + R_precalculate)); + const unsigned char G_1 = float_to_char((Y_1 + G_precalculate)); + const unsigned char G_2 = float_to_char((Y_2 + G_precalculate)); + const unsigned char G_3 = float_to_char((Y_3 + G_precalculate)); + const unsigned char G_4 = float_to_char((Y_4 + G_precalculate)); + const unsigned char B_1 = float_to_char((Y_1 + B_precalculate)); + const unsigned char B_2 = float_to_char((Y_2 + B_precalculate)); + const unsigned char B_3 = float_to_char((Y_3 + B_precalculate)); + const unsigned char B_4 = float_to_char((Y_4 + B_precalculate)); + + // Write back + *(bgra_addr + x) = (B_1 << 0)| (G_1 << 8) | (R_1 << 16) | (255 << 24); + *(bgra_addr + x + 1) = (B_2 << 0)| (G_2 << 8) | (R_2 << 16) | (255 << 24); + *(bgra_addr + x + width) = (B_3 << 0)| (G_3 << 8) | (R_3 << 16) | (255 << 24); + *(bgra_addr + x + width + 1) = (B_4 << 0)| (G_4 << 8) | (R_4 << 16) | (255 << 24); + } +} +#endif + + +/* + * yuv_to_rgb_w32() + * + * processes to line of yuv-input, width has to be a multiple of 32 + * two lines of yuv are taken as input + * + * @param y_addr address of the y plane in local store + * @param v_addr address of the v plane in local store + * @param u_addr address of the u plane in local store + * @param bgra_addr_ address of the bgra output buffer + * @param width the width in pixel + */ +void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) { + // each pixel is stored as an integer + unsigned int* bgra_addr = (unsigned int*) bgra_addr_; + + unsigned int x; + for(x = 0; x < width; x+=32) { + // Gehe zweischrittig durch die zeile, da jeder u und v wert fuer 4 pixel(zwei hoch, zwei breit) gilt + + const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x)); + const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16)); + const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width)); + const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16)); + const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1))); + const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1))); + + const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128); + const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128); + const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128); + const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128); + + const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128); + const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128); + const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128); + const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128); + + vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0); + vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0); + vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0); + vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0); + vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0); + vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0); + vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0); + vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0); + vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0); + vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0); + vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0); + vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0); + vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0); + vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0); + vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0); + vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0); + + const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1); + const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2); + const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3); + const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4); + + const vector float R1_precalculate = spu_shuffle(R1a_precalculate, R1a_precalculate, vec_select_floats_upper); + const vector float R2_precalculate = spu_shuffle(R1a_precalculate, R1a_precalculate, vec_select_floats_lower); + const vector float R3_precalculate = spu_shuffle(R2a_precalculate, R2a_precalculate, vec_select_floats_upper); + const vector float R4_precalculate = spu_shuffle(R2a_precalculate, R2a_precalculate, vec_select_floats_lower); + const vector float R5_precalculate = spu_shuffle(R3a_precalculate, R3a_precalculate, vec_select_floats_upper); + const vector float R6_precalculate = spu_shuffle(R3a_precalculate, R3a_precalculate, vec_select_floats_lower); + const vector float R7_precalculate = spu_shuffle(R4a_precalculate, R4a_precalculate, vec_select_floats_upper); + const vector float R8_precalculate = spu_shuffle(R4a_precalculate, R4a_precalculate, vec_select_floats_lower); + + + const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff)); + const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff)); + const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff)); + const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff)); + + const vector float G1_precalculate = spu_shuffle(G1a_precalculate, G1a_precalculate, vec_select_floats_upper); + const vector float G2_precalculate = spu_shuffle(G1a_precalculate, G1a_precalculate, vec_select_floats_lower); + const vector float G3_precalculate = spu_shuffle(G2a_precalculate, G2a_precalculate, vec_select_floats_upper); + const vector float G4_precalculate = spu_shuffle(G2a_precalculate, G2a_precalculate, vec_select_floats_lower); + const vector float G5_precalculate = spu_shuffle(G3a_precalculate, G3a_precalculate, vec_select_floats_upper); + const vector float G6_precalculate = spu_shuffle(G3a_precalculate, G3a_precalculate, vec_select_floats_lower); + const vector float G7_precalculate = spu_shuffle(G4a_precalculate, G4a_precalculate, vec_select_floats_upper); + const vector float G8_precalculate = spu_shuffle(G4a_precalculate, G4a_precalculate, vec_select_floats_lower); + + + const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1); + const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2); + const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3); + const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4); + + const vector float B1_precalculate = spu_shuffle(B1a_precalculate, B1a_precalculate, vec_select_floats_upper); + const vector float B2_precalculate = spu_shuffle(B1a_precalculate, B1a_precalculate, vec_select_floats_lower); + const vector float B3_precalculate = spu_shuffle(B2a_precalculate, B2a_precalculate, vec_select_floats_upper); + const vector float B4_precalculate = spu_shuffle(B2a_precalculate, B2a_precalculate, vec_select_floats_lower); + const vector float B5_precalculate = spu_shuffle(B3a_precalculate, B3a_precalculate, vec_select_floats_upper); + const vector float B6_precalculate = spu_shuffle(B3a_precalculate, B3a_precalculate, vec_select_floats_lower); + const vector float B7_precalculate = spu_shuffle(B4a_precalculate, B4a_precalculate, vec_select_floats_upper); + const vector float B8_precalculate = spu_shuffle(B4a_precalculate, B4a_precalculate, vec_select_floats_lower); + + + const vector unsigned int R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate)); + const vector unsigned int R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate)); + const vector unsigned int R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate)); + const vector unsigned int R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate)); + const vector unsigned int R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate)); + const vector unsigned int R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate)); + const vector unsigned int R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate)); + const vector unsigned int R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate)); + const vector unsigned int R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate)); + const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate)); + const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate)); + const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate)); + const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate)); + const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate)); + const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate)); + const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate)); + + const vector unsigned int G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate)); + const vector unsigned int G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate)); + const vector unsigned int G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate)); + const vector unsigned int G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate)); + const vector unsigned int G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate)); + const vector unsigned int G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate)); + const vector unsigned int G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate)); + const vector unsigned int G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate)); + const vector unsigned int G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate)); + const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate)); + const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate)); + const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate)); + const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate)); + const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate)); + const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate)); + const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate)); + + const vector unsigned int B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate)); + const vector unsigned int B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate)); + const vector unsigned int B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate)); + const vector unsigned int B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate)); + const vector unsigned int B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate)); + const vector unsigned int B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate)); + const vector unsigned int B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate)); + const vector unsigned int B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate)); + const vector unsigned int B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate)); + const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate)); + const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate)); + const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate)); + const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate)); + const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate)); + const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate)); + const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate)); + + *((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha, B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1))); + *((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha, B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1))); + *((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha, B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1))); + *((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha, B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1))); + *((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha, B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1))); + *((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha, B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1))); + *((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha, B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1))); + *((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha, B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1))); + *((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha, B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1))); + *((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1))); + *((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1))); + *((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1))); + *((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1))); + *((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1))); + *((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1))); + *((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1))); + } +} +