Mercurial > sdl-ios-xcode
comparison src/video/ps3/spulibs/yuv2rgb.c @ 3257:94fb40a4a9a7
Merged Martin's code changes from Google Summer of Code 2009
author | Sam Lantinga <slouken@libsdl.org> |
---|---|
date | Mon, 07 Sep 2009 04:51:29 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
3256:83c87f2b2aab | 3257:94fb40a4a9a7 |
---|---|
1 /* | |
2 * SDL - Simple DirectMedia Layer | |
3 * CELL BE Support for PS3 Framebuffer | |
4 * Copyright (C) 2008, 2009 International Business Machines Corporation | |
5 * | |
6 * This library is free software; you can redistribute it and/or modify it | |
7 * under the terms of the GNU Lesser General Public License as published | |
8 * by the Free Software Foundation; either version 2.1 of the License, or | |
9 * (at your option) any later version. | |
10 * | |
11 * This library is distributed in the hope that it will be useful, but | |
12 * WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 * Lesser General Public License for more details. | |
15 * | |
16 * You should have received a copy of the GNU Lesser General Public | |
17 * License along with this library; if not, write to the Free Software | |
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 | |
19 * USA | |
20 * | |
21 * Martin Lowinski <lowinski [at] de [dot] ibm [dot] com> | |
22 * Dirk Herrendoerfer <d.herrendoerfer [at] de [dot] ibm [dot] com> | |
23 * SPE code based on research by: | |
24 * Rene Becker | |
25 * Thimo Emmerich | |
26 */ | |
27 | |
28 #include "spu_common.h" | |
29 | |
30 #include <spu_intrinsics.h> | |
31 #include <spu_mfcio.h> | |
32 | |
33 // Debugging | |
34 //#define DEBUG | |
35 | |
36 // Test environment for /2 resolutions | |
37 //#define TESTING | |
38 | |
/*
 * deprintf() - printf-style debug trace.
 *
 * With DEBUG defined, the message is written to stdout and flushed
 * immediately. Without DEBUG the call expands to an empty statement.
 *
 * Both variants are wrapped in do { } while (0) so that "deprintf(...);"
 * is exactly one statement and is safe as the unbraced body of an
 * if/else. Callers must terminate the call with a semicolon.
 */
#ifdef DEBUG
#define deprintf(fmt, args...) \
	do { \
		fprintf(stdout, fmt, ##args); \
		fflush(stdout); \
	} while (0)
#else
#define deprintf(fmt, args...) do { } while (0)
#endif
46 | |
47 struct yuv2rgb_parms_t parms_converter __attribute__((aligned(128))); | |
48 | |
49 /* A maximum of 8 lines Y, therefore 4 lines V, 4 lines U are stored | |
50 * there might be the need to retrieve misaligned data, adjust | |
51 * incoming v and u plane to be able to handle this (add 128) | |
52 */ | |
53 unsigned char y_plane[2][(MAX_HDTV_WIDTH + 128) * 4] __attribute__((aligned(128))); | |
54 unsigned char v_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128))); | |
55 unsigned char u_plane[2][(MAX_HDTV_WIDTH + 128) * 2] __attribute__((aligned(128))); | |
56 | |
57 /* A maximum of 4 lines BGRA are stored, 4 byte per pixel */ | |
58 unsigned char bgra[4 * MAX_HDTV_WIDTH * 4] __attribute__((aligned(128))); | |
59 | |
60 /* some vectors needed by the float to int conversion */ | |
61 static const vector float vec_255 = { 255.0f, 255.0f, 255.0f, 255.0f }; | |
62 static const vector float vec_0_1 = { 0.1f, 0.1f, 0.1f, 0.1f }; | |
63 | |
/*
 * Plane converters. They take no arguments and read everything they need
 * from parms_converter. Declared with (void) so that these are real
 * prototypes and stray arguments are rejected by the compiler.
 */
void yuv_to_rgb_w16(void);
void yuv_to_rgb_w32(void);

/*
 * Line converters: turn two lines of Y plus one line of V and U (all in
 * local store) into two lines of BGRA.
 *
 * NOTE(review): yuv_to_rgb_w16() also calls yuv_to_rgb_w16_line(), for which
 * no prototype is visible here - confirm it is declared before first use.
 */
void yuv_to_rgb_w2_line(unsigned char *y_addr, unsigned char *v_addr, unsigned char *u_addr, unsigned char *bgra_addr, unsigned int width);
void yuv_to_rgb_w32_line(unsigned char *y_addr, unsigned char *v_addr, unsigned char *u_addr, unsigned char *bgra_addr_, unsigned int width);
69 | |
70 | |
71 int main(unsigned long long spe_id __attribute__((unused)), unsigned long long argp __attribute__ ((unused))) | |
72 { | |
73 deprintf("[SPU] yuv2rgb_spu is up... (on SPE #%llu)\n", spe_id); | |
74 uint32_t ea_mfc, mbox; | |
75 // send ready message | |
76 spu_write_out_mbox(SPU_READY); | |
77 | |
78 while (1) { | |
79 /* Check mailbox */ | |
80 mbox = spu_read_in_mbox(); | |
81 deprintf("[SPU] Message is %u\n", mbox); | |
82 switch (mbox) { | |
83 case SPU_EXIT: | |
84 deprintf("[SPU] yuv2rgb_converter goes down...\n"); | |
85 return 0; | |
86 case SPU_START: | |
87 break; | |
88 default: | |
89 deprintf("[SPU] Cannot handle message\n"); | |
90 continue; | |
91 } | |
92 | |
93 /* Tag Manager setup */ | |
94 unsigned int tag_id; | |
95 tag_id = mfc_multi_tag_reserve(1); | |
96 if (tag_id == MFC_TAG_INVALID) { | |
97 deprintf("[SPU] Failed to reserve mfc tags on yuv2rgb_converter\n"); | |
98 return 0; | |
99 } | |
100 | |
101 /* DMA transfer for the input parameters */ | |
102 ea_mfc = spu_read_in_mbox(); | |
103 deprintf("[SPU] Message on yuv2rgb_converter is %u\n", ea_mfc); | |
104 spu_mfcdma32(&parms_converter, (unsigned int)ea_mfc, sizeof(struct yuv2rgb_parms_t), tag_id, MFC_GET_CMD); | |
105 DMA_WAIT_TAG(tag_id); | |
106 | |
107 /* There are alignment issues that involve handling of special cases | |
108 * a width of 32 results in a width of 16 in the chrominance | |
109 * --> choose the proper handling to optimize the performance | |
110 */ | |
111 deprintf("[SPU] Convert %ix%i from YUV to RGB\n", parms_converter.src_pixel_width, parms_converter.src_pixel_height); | |
112 if (!(parms_converter.src_pixel_width & 0x1f)) { | |
113 deprintf("[SPU] Using yuv_to_rgb_w16\n"); | |
114 yuv_to_rgb_w16(); | |
115 } else { | |
116 deprintf("[SPU] Using yuv_to_rgb_w32\n"); | |
117 yuv_to_rgb_w32(); | |
118 } | |
119 | |
120 mfc_multi_tag_release(tag_id, 1); | |
121 deprintf("[SPU] yuv2rgb_spu... done!\n"); | |
122 /* Send FIN message */ | |
123 spu_write_out_mbox(SPU_FIN); | |
124 } | |
125 | |
126 return 0; | |
127 } | |
128 | |
129 | |
130 /* | |
131 * float_to_char() | |
132 * | |
133 * converts a float to a character using saturated | |
134 * arithmetic | |
135 * | |
136 * @param s float for conversion | |
137 * @returns converted character | |
138 */ | |
139 inline static unsigned char float_to_char(float s) { | |
140 vector float vec_s = spu_splats(s); | |
141 vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s); | |
142 vec_s = spu_sel(vec_s, vec_0_1, select_1); | |
143 | |
144 vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255); | |
145 vec_s = spu_sel(vec_s, vec_255, select_2); | |
146 return (unsigned char) spu_extract(vec_s,0); | |
147 } | |
148 | |
149 | |
150 /* | |
151 * vfloat_to_vuint() | |
152 * | |
153 * converts a float vector to an unsinged int vector using saturated | |
154 * arithmetic | |
155 * | |
156 * @param vec_s float vector for conversion | |
157 * @returns converted unsigned int vector | |
158 */ | |
159 inline static vector unsigned int vfloat_to_vuint(vector float vec_s) { | |
160 vector unsigned int select_1 = spu_cmpgt(vec_0_1, vec_s); | |
161 vec_s = spu_sel(vec_s, vec_0_1, select_1); | |
162 | |
163 vector unsigned int select_2 = spu_cmpgt(vec_s, vec_255); | |
164 vec_s = spu_sel(vec_s, vec_255, select_2); | |
165 return spu_convtu(vec_s,0); | |
166 } | |
167 | |
168 | |
169 void yuv_to_rgb_w16() { | |
170 // Pixel dimensions of the picture | |
171 uint32_t width, height; | |
172 | |
173 // Extract parameters | |
174 width = parms_converter.src_pixel_width; | |
175 height = parms_converter.src_pixel_height; | |
176 | |
177 // Plane data management | |
178 // Y | |
179 unsigned char* ram_addr_y = parms_converter.y_plane; | |
180 // V | |
181 unsigned char* ram_addr_v = parms_converter.v_plane; | |
182 // U | |
183 unsigned char* ram_addr_u = parms_converter.u_plane; | |
184 | |
185 // BGRA | |
186 unsigned char* ram_addr_bgra = parms_converter.dstBuffer; | |
187 | |
188 // Strides | |
189 unsigned int stride_y = width; | |
190 unsigned int stride_vu = width>>1; | |
191 | |
192 // Buffer management | |
193 unsigned int buf_idx = 0; | |
194 unsigned int size_4lines_y = stride_y<<2; | |
195 unsigned int size_2lines_y = stride_y<<1; | |
196 unsigned int size_2lines_vu = stride_vu<<1; | |
197 | |
198 // 2*width*4byte_per_pixel | |
199 unsigned int size_2lines_bgra = width<<3; | |
200 | |
201 | |
202 // start double-buffered processing | |
203 // 4 lines y | |
204 spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD); | |
205 | |
206 // 2 lines v | |
207 spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD); | |
208 | |
209 // 2 lines u | |
210 spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD); | |
211 | |
212 // Wait for these transfers to be completed | |
213 DMA_WAIT_TAG((RETR_BUF + buf_idx)); | |
214 | |
215 unsigned int i; | |
216 for(i=0; i<(height>>2)-1; i++) { | |
217 | |
218 buf_idx^=1; | |
219 | |
220 // 4 lines y | |
221 spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF+buf_idx, MFC_GET_CMD); | |
222 | |
223 // 2 lines v | |
224 spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD); | |
225 | |
226 // 2 lines u | |
227 spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF+buf_idx, MFC_GET_CMD); | |
228 | |
229 DMA_WAIT_TAG((RETR_BUF + buf_idx)); | |
230 | |
231 buf_idx^=1; | |
232 | |
233 | |
234 // Convert YUV to BGRA, store it back (first two lines) | |
235 #ifndef TESTING | |
236 yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); | |
237 | |
238 // Next two lines | |
239 yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y, | |
240 v_plane[buf_idx] + stride_vu, | |
241 u_plane[buf_idx] + stride_vu, | |
242 bgra + size_2lines_bgra, | |
243 width); | |
244 #else | |
245 yuv_to_rgb_w2_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); | |
246 | |
247 // Next two lines | |
248 yuv_to_rgb_w2_line(y_plane[buf_idx] + size_2lines_y, | |
249 v_plane[buf_idx] + stride_vu, | |
250 u_plane[buf_idx] + stride_vu, | |
251 bgra + size_2lines_bgra, | |
252 width); | |
253 #endif | |
254 | |
255 // Wait for previous storing transfer to be completed | |
256 DMA_WAIT_TAG(STR_BUF); | |
257 | |
258 // Store converted lines in two steps->max transfer size 16384 | |
259 spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); | |
260 ram_addr_bgra += size_2lines_bgra; | |
261 spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); | |
262 ram_addr_bgra += size_2lines_bgra; | |
263 | |
264 // Move 4 lines | |
265 ram_addr_y += size_4lines_y; | |
266 ram_addr_v += size_2lines_vu; | |
267 ram_addr_u += size_2lines_vu; | |
268 | |
269 buf_idx^=1; | |
270 } | |
271 | |
272 #ifndef TESTING | |
273 // Convert YUV to BGRA, store it back (first two lines) | |
274 yuv_to_rgb_w16_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); | |
275 | |
276 // Next two lines | |
277 yuv_to_rgb_w16_line(y_plane[buf_idx] + size_2lines_y, | |
278 v_plane[buf_idx] + stride_vu, | |
279 u_plane[buf_idx] + stride_vu, | |
280 bgra + size_2lines_bgra, | |
281 width); | |
282 #else | |
283 // Convert YUV to BGRA, store it back (first two lines) | |
284 yuv_to_rgb_w2_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); | |
285 | |
286 // Next two lines | |
287 yuv_to_rgb_w2_line(y_plane[buf_idx] + size_2lines_y, | |
288 v_plane[buf_idx] + stride_vu, | |
289 u_plane[buf_idx] + stride_vu, | |
290 bgra + size_2lines_bgra, | |
291 width); | |
292 #endif | |
293 | |
294 // Wait for previous storing transfer to be completed | |
295 DMA_WAIT_TAG(STR_BUF); | |
296 spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); | |
297 ram_addr_bgra += size_2lines_bgra; | |
298 spu_mfcdma32(bgra+size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); | |
299 | |
300 // wait for previous storing transfer to be completed | |
301 DMA_WAIT_TAG(STR_BUF); | |
302 | |
303 } | |
304 | |
305 | |
306 void yuv_to_rgb_w32() { | |
307 // Pixel dimensions of the picture | |
308 uint32_t width, height; | |
309 | |
310 // Extract parameters | |
311 width = parms_converter.src_pixel_width; | |
312 height = parms_converter.src_pixel_height; | |
313 | |
314 // Plane data management | |
315 // Y | |
316 unsigned char* ram_addr_y = parms_converter.y_plane; | |
317 // V | |
318 unsigned char* ram_addr_v = parms_converter.v_plane; | |
319 // U | |
320 unsigned char* ram_addr_u = parms_converter.u_plane; | |
321 | |
322 // BGRA | |
323 unsigned char* ram_addr_bgra = parms_converter.dstBuffer; | |
324 | |
325 // Strides | |
326 unsigned int stride_y = width; | |
327 unsigned int stride_vu = width>>1; | |
328 | |
329 // Buffer management | |
330 unsigned int buf_idx = 0; | |
331 unsigned int size_4lines_y = stride_y<<2; | |
332 unsigned int size_2lines_y = stride_y<<1; | |
333 unsigned int size_2lines_vu = stride_vu<<1; | |
334 | |
335 // 2*width*4byte_per_pixel | |
336 unsigned int size_2lines_bgra = width<<3; | |
337 | |
338 // start double-buffered processing | |
339 // 4 lines y | |
340 spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD); | |
341 // 2 lines v | |
342 spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD); | |
343 // 2 lines u | |
344 spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD); | |
345 | |
346 // Wait for these transfers to be completed | |
347 DMA_WAIT_TAG((RETR_BUF + buf_idx)); | |
348 | |
349 unsigned int i; | |
350 for(i=0; i < (height>>2)-1; i++) { | |
351 buf_idx^=1; | |
352 // 4 lines y | |
353 spu_mfcdma32(y_plane[buf_idx], (unsigned int) ram_addr_y+size_4lines_y, size_4lines_y, RETR_BUF + buf_idx, MFC_GET_CMD); | |
354 deprintf("4lines = %d\n", size_4lines_y); | |
355 // 2 lines v | |
356 spu_mfcdma32(v_plane[buf_idx], (unsigned int) ram_addr_v+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD); | |
357 deprintf("2lines = %d\n", size_2lines_vu); | |
358 // 2 lines u | |
359 spu_mfcdma32(u_plane[buf_idx], (unsigned int) ram_addr_u+size_2lines_vu, size_2lines_vu, RETR_BUF + buf_idx, MFC_GET_CMD); | |
360 deprintf("2lines = %d\n", size_2lines_vu); | |
361 | |
362 DMA_WAIT_TAG((RETR_BUF + buf_idx)); | |
363 | |
364 buf_idx^=1; | |
365 | |
366 // Convert YUV to BGRA, store it back (first two lines) | |
367 yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); | |
368 | |
369 // Next two lines | |
370 yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y, | |
371 v_plane[buf_idx] + stride_vu, | |
372 u_plane[buf_idx] + stride_vu, | |
373 bgra + size_2lines_bgra, | |
374 width); | |
375 | |
376 // Wait for previous storing transfer to be completed | |
377 DMA_WAIT_TAG(STR_BUF); | |
378 | |
379 // Store converted lines in two steps->max transfer size 16384 | |
380 spu_mfcdma32(bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); | |
381 ram_addr_bgra += size_2lines_bgra; | |
382 spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int)ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); | |
383 ram_addr_bgra += size_2lines_bgra; | |
384 | |
385 // Move 4 lines | |
386 ram_addr_y += size_4lines_y; | |
387 ram_addr_v += size_2lines_vu; | |
388 ram_addr_u += size_2lines_vu; | |
389 | |
390 buf_idx^=1; | |
391 } | |
392 | |
393 // Convert YUV to BGRA, store it back (first two lines) | |
394 yuv_to_rgb_w32_line(y_plane[buf_idx], v_plane[buf_idx], u_plane[buf_idx], bgra, width); | |
395 | |
396 // Next two lines | |
397 yuv_to_rgb_w32_line(y_plane[buf_idx] + size_2lines_y, | |
398 v_plane[buf_idx] + stride_vu, | |
399 u_plane[buf_idx] + stride_vu, | |
400 bgra + size_2lines_bgra, | |
401 width); | |
402 | |
403 // Wait for previous storing transfer to be completed | |
404 DMA_WAIT_TAG(STR_BUF); | |
405 spu_mfcdma32(bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); | |
406 ram_addr_bgra += size_2lines_bgra; | |
407 spu_mfcdma32(bgra + size_2lines_bgra, (unsigned int) ram_addr_bgra, size_2lines_bgra, STR_BUF, MFC_PUT_CMD); | |
408 | |
409 // Wait for previous storing transfer to be completed | |
410 DMA_WAIT_TAG(STR_BUF); | |
411 } | |
412 | |
413 | |
414 /* Some vectors needed by the yuv 2 rgb conversion algorithm */ | |
415 const vector float vec_minus_128 = { -128.0f, -128.0f, -128.0f, -128.0f }; | |
416 const vector unsigned char vec_null = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; | |
417 const vector unsigned char vec_char2int_first = { 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x11, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x13 }; | |
418 const vector unsigned char vec_char2int_second = { 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x15, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x17 }; | |
419 const vector unsigned char vec_char2int_third = { 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x1A, 0x00, 0x00, 0x00, 0x1B }; | |
420 const vector unsigned char vec_char2int_fourth = { 0x00, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x00, 0x1D, 0x00, 0x00, 0x00, 0x1E, 0x00, 0x00, 0x00, 0x1F }; | |
421 | |
422 const vector float vec_R_precalc_coeff = {1.403f, 1.403f, 1.403f, 1.403f}; | |
423 const vector float vec_Gu_precalc_coeff = {-0.344f, -0.344f, -0.344f, -0.344f}; | |
424 const vector float vec_Gv_precalc_coeff = {-0.714f, -0.714f, -0.714f, -0.714f}; | |
425 const vector float vec_B_precalc_coeff = {1.773f, 1.773f, 1.773f, 1.773f}; | |
426 | |
427 const vector unsigned int vec_alpha = { 255 << 24, 255 << 24, 255 << 24, 255 << 24 }; | |
428 | |
429 const vector unsigned char vec_select_floats_upper = { 0x00, 0x01, 0x02, 0x03, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x04, 0x05, 0x06, 0x07 }; | |
430 const vector unsigned char vec_select_floats_lower = { 0x08, 0x09, 0x0A, 0x0B, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x0C, 0x0D, 0x0E, 0x0F }; | |
431 | |
432 | |
#ifdef TESTING
/* Packs three 8 bit channels into one pixel: B in bits 0-7, G in bits 8-15,
 * R in bits 16-23 and a fixed alpha of 255 in bits 24-31. All shifts are
 * done on unsigned values (255 << 24 does not fit into a signed int). */
static inline unsigned int w2_pack_bgra(unsigned char r, unsigned char g, unsigned char b)
{
	return ((unsigned int) b << 0) | ((unsigned int) g << 8) | ((unsigned int) r << 16) | (255u << 24);
}

/*
 * yuv_to_rgb_w2_line()
 *
 * - scalar reference conversion, only built when TESTING is defined
 * - two lines of YUV are taken as input, two lines of BGRA are written
 * - every step handles a block of 2x2 pixels, which share one U and one
 *   V value (YUV 4:2:0); the loop advances by 2 pixels per step, so width
 *   has to be a multiple of 2
 *
 * @param y_addr address of the y plane (local store)
 * @param v_addr address of the v plane (local store)
 * @param u_addr address of the u plane (local store)
 * @param bgra_addr_char address of the bgra output buffer (local store)
 * @param width the width of a line in pixel
 */
void yuv_to_rgb_w2_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_char, unsigned int width) {
	/* each pixel is stored as an integer */
	unsigned int* bgra_addr = (unsigned int*) bgra_addr_char;

	unsigned int x;
	for (x = 0; x < width; x += 2) {
		/* The 4 Y values of the 2x2 block and its U and V value */
		const unsigned char Y_1 = y_addr[x];
		const unsigned char Y_2 = y_addr[x + 1];
		const unsigned char Y_3 = y_addr[x + width];
		const unsigned char Y_4 = y_addr[x + width + 1];
		const unsigned char U = u_addr[x >> 1];
		const unsigned char V = v_addr[x >> 1];

		/* Chroma contribution, identical for all 4 pixels */
		const float V_minus_128 = (float) V - 128.0f;
		const float U_minus_128 = (float) U - 128.0f;

		const float R_precalculate = 1.403f * V_minus_128;
		const float G_precalculate = -(0.344f * U_minus_128 + 0.714f * V_minus_128);
		const float B_precalculate = 1.773f * U_minus_128;

		/* Add luma, saturate and write back */
		bgra_addr[x] = w2_pack_bgra(
				float_to_char(Y_1 + R_precalculate),
				float_to_char(Y_1 + G_precalculate),
				float_to_char(Y_1 + B_precalculate));
		bgra_addr[x + 1] = w2_pack_bgra(
				float_to_char(Y_2 + R_precalculate),
				float_to_char(Y_2 + G_precalculate),
				float_to_char(Y_2 + B_precalculate));
		bgra_addr[x + width] = w2_pack_bgra(
				float_to_char(Y_3 + R_precalculate),
				float_to_char(Y_3 + G_precalculate),
				float_to_char(Y_3 + B_precalculate));
		bgra_addr[x + width + 1] = w2_pack_bgra(
				float_to_char(Y_4 + R_precalculate),
				float_to_char(Y_4 + G_precalculate),
				float_to_char(Y_4 + B_precalculate));
	}
}
#endif
492 | |
493 | |
494 /* | |
495 * yuv_to_rgb_w32_line() | |
496 * | |
497 * processes two lines of yuv-input, width has to be a multiple of 32 | |
498 * two lines of yuv are taken as input | |
499 * | |
500 * @param y_addr address of the y plane in local store | |
501 * @param v_addr address of the v plane in local store | |
502 * @param u_addr address of the u plane in local store | |
503 * @param bgra_addr_ address of the bgra output buffer | |
504 * @param width the width in pixel | |
505 */ | |
506 void yuv_to_rgb_w32_line(unsigned char* y_addr, unsigned char* v_addr, unsigned char* u_addr, unsigned char* bgra_addr_, unsigned int width) { | |
507 // each pixel is stored as an integer | |
508 unsigned int* bgra_addr = (unsigned int*) bgra_addr_; | |
509 | |
510 unsigned int x; | |
511 for(x = 0; x < width; x+=32) { | |
512 // Gehe zweischrittig durch die zeile, da jeder u und v wert fuer 4 pixel(zwei hoch, zwei breit) gilt | |
513 | |
514 const vector unsigned char vchar_Y_1 = *((vector unsigned char*)(y_addr + x)); | |
515 const vector unsigned char vchar_Y_2 = *((vector unsigned char*)(y_addr + x + 16)); | |
516 const vector unsigned char vchar_Y_3 = *((vector unsigned char*)(y_addr + x + width)); | |
517 const vector unsigned char vchar_Y_4 = *((vector unsigned char*)(y_addr + x + width + 16)); | |
518 const vector unsigned char vchar_U = *((vector unsigned char*)(u_addr + (x >> 1))); | |
519 const vector unsigned char vchar_V = *((vector unsigned char*)(v_addr + (x >> 1))); | |
520 | |
521 const vector float vfloat_U_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_first), 0),vec_minus_128); | |
522 const vector float vfloat_U_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_second), 0),vec_minus_128); | |
523 const vector float vfloat_U_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_third), 0),vec_minus_128); | |
524 const vector float vfloat_U_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_U, vec_char2int_fourth), 0),vec_minus_128); | |
525 | |
526 const vector float vfloat_V_1 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_first), 0),vec_minus_128); | |
527 const vector float vfloat_V_2 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_second), 0),vec_minus_128); | |
528 const vector float vfloat_V_3 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_third), 0),vec_minus_128); | |
529 const vector float vfloat_V_4 = spu_add(spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_V, vec_char2int_fourth), 0),vec_minus_128); | |
530 | |
531 vector float Y_1 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_first), 0); | |
532 vector float Y_2 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_second), 0); | |
533 vector float Y_3 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_third), 0); | |
534 vector float Y_4 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_1, vec_char2int_fourth), 0); | |
535 vector float Y_5 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_first), 0); | |
536 vector float Y_6 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_second), 0); | |
537 vector float Y_7 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_third), 0); | |
538 vector float Y_8 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_2, vec_char2int_fourth), 0); | |
539 vector float Y_9 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_first), 0); | |
540 vector float Y_10 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_second), 0); | |
541 vector float Y_11 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_third), 0); | |
542 vector float Y_12 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_3, vec_char2int_fourth), 0); | |
543 vector float Y_13 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_first), 0); | |
544 vector float Y_14 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_second), 0); | |
545 vector float Y_15 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_third), 0); | |
546 vector float Y_16 = spu_convtf((vector unsigned int)spu_shuffle(vec_null, vchar_Y_4, vec_char2int_fourth), 0); | |
547 | |
548 const vector float R1a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_1); | |
549 const vector float R2a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_2); | |
550 const vector float R3a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_3); | |
551 const vector float R4a_precalculate = spu_mul(vec_R_precalc_coeff, vfloat_V_4); | |
552 | |
553 const vector float R1_precalculate = spu_shuffle(R1a_precalculate, R1a_precalculate, vec_select_floats_upper); | |
554 const vector float R2_precalculate = spu_shuffle(R1a_precalculate, R1a_precalculate, vec_select_floats_lower); | |
555 const vector float R3_precalculate = spu_shuffle(R2a_precalculate, R2a_precalculate, vec_select_floats_upper); | |
556 const vector float R4_precalculate = spu_shuffle(R2a_precalculate, R2a_precalculate, vec_select_floats_lower); | |
557 const vector float R5_precalculate = spu_shuffle(R3a_precalculate, R3a_precalculate, vec_select_floats_upper); | |
558 const vector float R6_precalculate = spu_shuffle(R3a_precalculate, R3a_precalculate, vec_select_floats_lower); | |
559 const vector float R7_precalculate = spu_shuffle(R4a_precalculate, R4a_precalculate, vec_select_floats_upper); | |
560 const vector float R8_precalculate = spu_shuffle(R4a_precalculate, R4a_precalculate, vec_select_floats_lower); | |
561 | |
562 | |
563 const vector float G1a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_1, spu_mul(vfloat_V_1, vec_Gv_precalc_coeff)); | |
564 const vector float G2a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_2, spu_mul(vfloat_V_2, vec_Gv_precalc_coeff)); | |
565 const vector float G3a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_3, spu_mul(vfloat_V_3, vec_Gv_precalc_coeff)); | |
566 const vector float G4a_precalculate = spu_madd(vec_Gu_precalc_coeff, vfloat_U_4, spu_mul(vfloat_V_4, vec_Gv_precalc_coeff)); | |
567 | |
568 const vector float G1_precalculate = spu_shuffle(G1a_precalculate, G1a_precalculate, vec_select_floats_upper); | |
569 const vector float G2_precalculate = spu_shuffle(G1a_precalculate, G1a_precalculate, vec_select_floats_lower); | |
570 const vector float G3_precalculate = spu_shuffle(G2a_precalculate, G2a_precalculate, vec_select_floats_upper); | |
571 const vector float G4_precalculate = spu_shuffle(G2a_precalculate, G2a_precalculate, vec_select_floats_lower); | |
572 const vector float G5_precalculate = spu_shuffle(G3a_precalculate, G3a_precalculate, vec_select_floats_upper); | |
573 const vector float G6_precalculate = spu_shuffle(G3a_precalculate, G3a_precalculate, vec_select_floats_lower); | |
574 const vector float G7_precalculate = spu_shuffle(G4a_precalculate, G4a_precalculate, vec_select_floats_upper); | |
575 const vector float G8_precalculate = spu_shuffle(G4a_precalculate, G4a_precalculate, vec_select_floats_lower); | |
576 | |
577 | |
578 const vector float B1a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_1); | |
579 const vector float B2a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_2); | |
580 const vector float B3a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_3); | |
581 const vector float B4a_precalculate = spu_mul(vec_B_precalc_coeff, vfloat_U_4); | |
582 | |
583 const vector float B1_precalculate = spu_shuffle(B1a_precalculate, B1a_precalculate, vec_select_floats_upper); | |
584 const vector float B2_precalculate = spu_shuffle(B1a_precalculate, B1a_precalculate, vec_select_floats_lower); | |
585 const vector float B3_precalculate = spu_shuffle(B2a_precalculate, B2a_precalculate, vec_select_floats_upper); | |
586 const vector float B4_precalculate = spu_shuffle(B2a_precalculate, B2a_precalculate, vec_select_floats_lower); | |
587 const vector float B5_precalculate = spu_shuffle(B3a_precalculate, B3a_precalculate, vec_select_floats_upper); | |
588 const vector float B6_precalculate = spu_shuffle(B3a_precalculate, B3a_precalculate, vec_select_floats_lower); | |
589 const vector float B7_precalculate = spu_shuffle(B4a_precalculate, B4a_precalculate, vec_select_floats_upper); | |
590 const vector float B8_precalculate = spu_shuffle(B4a_precalculate, B4a_precalculate, vec_select_floats_lower); | |
591 | |
592 | |
593 const vector unsigned int R_1 = vfloat_to_vuint(spu_add( Y_1, R1_precalculate)); | |
594 const vector unsigned int R_2 = vfloat_to_vuint(spu_add( Y_2, R2_precalculate)); | |
595 const vector unsigned int R_3 = vfloat_to_vuint(spu_add( Y_3, R3_precalculate)); | |
596 const vector unsigned int R_4 = vfloat_to_vuint(spu_add( Y_4, R4_precalculate)); | |
597 const vector unsigned int R_5 = vfloat_to_vuint(spu_add( Y_5, R5_precalculate)); | |
598 const vector unsigned int R_6 = vfloat_to_vuint(spu_add( Y_6, R6_precalculate)); | |
599 const vector unsigned int R_7 = vfloat_to_vuint(spu_add( Y_7, R7_precalculate)); | |
600 const vector unsigned int R_8 = vfloat_to_vuint(spu_add( Y_8, R8_precalculate)); | |
601 const vector unsigned int R_9 = vfloat_to_vuint(spu_add( Y_9, R1_precalculate)); | |
602 const vector unsigned int R_10 = vfloat_to_vuint(spu_add(Y_10, R2_precalculate)); | |
603 const vector unsigned int R_11 = vfloat_to_vuint(spu_add(Y_11, R3_precalculate)); | |
604 const vector unsigned int R_12 = vfloat_to_vuint(spu_add(Y_12, R4_precalculate)); | |
605 const vector unsigned int R_13 = vfloat_to_vuint(spu_add(Y_13, R5_precalculate)); | |
606 const vector unsigned int R_14 = vfloat_to_vuint(spu_add(Y_14, R6_precalculate)); | |
607 const vector unsigned int R_15 = vfloat_to_vuint(spu_add(Y_15, R7_precalculate)); | |
608 const vector unsigned int R_16 = vfloat_to_vuint(spu_add(Y_16, R8_precalculate)); | |
609 | |
610 const vector unsigned int G_1 = vfloat_to_vuint(spu_add( Y_1, G1_precalculate)); | |
611 const vector unsigned int G_2 = vfloat_to_vuint(spu_add( Y_2, G2_precalculate)); | |
612 const vector unsigned int G_3 = vfloat_to_vuint(spu_add( Y_3, G3_precalculate)); | |
613 const vector unsigned int G_4 = vfloat_to_vuint(spu_add( Y_4, G4_precalculate)); | |
614 const vector unsigned int G_5 = vfloat_to_vuint(spu_add( Y_5, G5_precalculate)); | |
615 const vector unsigned int G_6 = vfloat_to_vuint(spu_add( Y_6, G6_precalculate)); | |
616 const vector unsigned int G_7 = vfloat_to_vuint(spu_add( Y_7, G7_precalculate)); | |
617 const vector unsigned int G_8 = vfloat_to_vuint(spu_add( Y_8, G8_precalculate)); | |
618 const vector unsigned int G_9 = vfloat_to_vuint(spu_add( Y_9, G1_precalculate)); | |
619 const vector unsigned int G_10 = vfloat_to_vuint(spu_add(Y_10, G2_precalculate)); | |
620 const vector unsigned int G_11 = vfloat_to_vuint(spu_add(Y_11, G3_precalculate)); | |
621 const vector unsigned int G_12 = vfloat_to_vuint(spu_add(Y_12, G4_precalculate)); | |
622 const vector unsigned int G_13 = vfloat_to_vuint(spu_add(Y_13, G5_precalculate)); | |
623 const vector unsigned int G_14 = vfloat_to_vuint(spu_add(Y_14, G6_precalculate)); | |
624 const vector unsigned int G_15 = vfloat_to_vuint(spu_add(Y_15, G7_precalculate)); | |
625 const vector unsigned int G_16 = vfloat_to_vuint(spu_add(Y_16, G8_precalculate)); | |
626 | |
627 const vector unsigned int B_1 = vfloat_to_vuint(spu_add( Y_1, B1_precalculate)); | |
628 const vector unsigned int B_2 = vfloat_to_vuint(spu_add( Y_2, B2_precalculate)); | |
629 const vector unsigned int B_3 = vfloat_to_vuint(spu_add( Y_3, B3_precalculate)); | |
630 const vector unsigned int B_4 = vfloat_to_vuint(spu_add( Y_4, B4_precalculate)); | |
631 const vector unsigned int B_5 = vfloat_to_vuint(spu_add( Y_5, B5_precalculate)); | |
632 const vector unsigned int B_6 = vfloat_to_vuint(spu_add( Y_6, B6_precalculate)); | |
633 const vector unsigned int B_7 = vfloat_to_vuint(spu_add( Y_7, B7_precalculate)); | |
634 const vector unsigned int B_8 = vfloat_to_vuint(spu_add( Y_8, B8_precalculate)); | |
635 const vector unsigned int B_9 = vfloat_to_vuint(spu_add( Y_9, B1_precalculate)); | |
636 const vector unsigned int B_10 = vfloat_to_vuint(spu_add(Y_10, B2_precalculate)); | |
637 const vector unsigned int B_11 = vfloat_to_vuint(spu_add(Y_11, B3_precalculate)); | |
638 const vector unsigned int B_12 = vfloat_to_vuint(spu_add(Y_12, B4_precalculate)); | |
639 const vector unsigned int B_13 = vfloat_to_vuint(spu_add(Y_13, B5_precalculate)); | |
640 const vector unsigned int B_14 = vfloat_to_vuint(spu_add(Y_14, B6_precalculate)); | |
641 const vector unsigned int B_15 = vfloat_to_vuint(spu_add(Y_15, B7_precalculate)); | |
642 const vector unsigned int B_16 = vfloat_to_vuint(spu_add(Y_16, B8_precalculate)); | |
643 | |
644 *((vector unsigned int*)(bgra_addr + x)) = spu_or(spu_or(vec_alpha, B_1), spu_or(spu_slqwbyte( R_1, 2),spu_slqwbyte(G_1, 1))); | |
645 *((vector unsigned int*)(bgra_addr + x + 4)) = spu_or(spu_or(vec_alpha, B_2), spu_or(spu_slqwbyte( R_2, 2),spu_slqwbyte(G_2, 1))); | |
646 *((vector unsigned int*)(bgra_addr + x + 8)) = spu_or(spu_or(vec_alpha, B_3), spu_or(spu_slqwbyte( R_3, 2),spu_slqwbyte(G_3, 1))); | |
647 *((vector unsigned int*)(bgra_addr + x + 12)) = spu_or(spu_or(vec_alpha, B_4), spu_or(spu_slqwbyte( R_4, 2),spu_slqwbyte(G_4, 1))); | |
648 *((vector unsigned int*)(bgra_addr + x + 16)) = spu_or(spu_or(vec_alpha, B_5), spu_or(spu_slqwbyte( R_5, 2),spu_slqwbyte(G_5, 1))); | |
649 *((vector unsigned int*)(bgra_addr + x + 20)) = spu_or(spu_or(vec_alpha, B_6), spu_or(spu_slqwbyte( R_6, 2),spu_slqwbyte(G_6, 1))); | |
650 *((vector unsigned int*)(bgra_addr + x + 24)) = spu_or(spu_or(vec_alpha, B_7), spu_or(spu_slqwbyte( R_7, 2),spu_slqwbyte(G_7, 1))); | |
651 *((vector unsigned int*)(bgra_addr + x + 28)) = spu_or(spu_or(vec_alpha, B_8), spu_or(spu_slqwbyte( R_8, 2),spu_slqwbyte(G_8, 1))); | |
652 *((vector unsigned int*)(bgra_addr + x + width)) = spu_or(spu_or(vec_alpha, B_9), spu_or(spu_slqwbyte( R_9, 2),spu_slqwbyte(G_9, 1))); | |
653 *((vector unsigned int*)(bgra_addr + x + width + 4)) = spu_or(spu_or(vec_alpha, B_10), spu_or(spu_slqwbyte(R_10, 2),spu_slqwbyte(G_10, 1))); | |
654 *((vector unsigned int*)(bgra_addr + x + width + 8)) = spu_or(spu_or(vec_alpha, B_11), spu_or(spu_slqwbyte(R_11, 2),spu_slqwbyte(G_11, 1))); | |
655 *((vector unsigned int*)(bgra_addr + x + width + 12)) = spu_or(spu_or(vec_alpha, B_12), spu_or(spu_slqwbyte(R_12, 2),spu_slqwbyte(G_12, 1))); | |
656 *((vector unsigned int*)(bgra_addr + x + width + 16)) = spu_or(spu_or(vec_alpha, B_13), spu_or(spu_slqwbyte(R_13, 2),spu_slqwbyte(G_13, 1))); | |
657 *((vector unsigned int*)(bgra_addr + x + width + 20)) = spu_or(spu_or(vec_alpha, B_14), spu_or(spu_slqwbyte(R_14, 2),spu_slqwbyte(G_14, 1))); | |
658 *((vector unsigned int*)(bgra_addr + x + width + 24)) = spu_or(spu_or(vec_alpha, B_15), spu_or(spu_slqwbyte(R_15, 2),spu_slqwbyte(G_15, 1))); | |
659 *((vector unsigned int*)(bgra_addr + x + width + 28)) = spu_or(spu_or(vec_alpha, B_16), spu_or(spu_slqwbyte(R_16, 2),spu_slqwbyte(G_16, 1))); | |
660 } | |
661 } | |
662 |