comparison src/video/SDL_yuv_mmx.c @ 4045:f420bba13676 SDL-1.2

GCC inline asm for MMX YUV processing no longer has text relocations (textrels) and now works when gcc reserves %ebx as the PIC register. Fixes Bugzilla #418.
author Ryan C. Gordon <icculus@icculus.org>
date Wed, 11 Jul 2007 06:26:22 +0000
parents 40edc79b0926
children 3a9e60224efe
comparison
equal deleted inserted replaced
4044:009d85e98922 4045:f420bba13676
19 Sam Lantinga 19 Sam Lantinga
20 slouken@libsdl.org 20 slouken@libsdl.org
21 */ 21 */
22 #include "SDL_config.h" 22 #include "SDL_config.h"
23 23
24 #if 0 /* FIXME: This code needs to be rewritten to reference the static data using relocatable addresses (e.g. http://www.gentoo.org/proj/en/hardened/pic-fix-guide.xml or http://nasm.sourceforge.net/doc/html/nasmdoc8.html#section-8.2) This code currently breaks on systems with readonly text segments (hardened Linux / Intel Mac) */ 24 #if (__GNUC__ > 2) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES
25 #if defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES
26 25
27 #include "SDL_stdinc.h" 26 #include "SDL_stdinc.h"
28 27
29 #define ASM_ARRAY(x) x[] __asm__("_" #x) __attribute__((used)) 28 #include "mmx.h"
30 29
31 static unsigned int ASM_ARRAY(MMX_0080w) = {0x00800080, 0x00800080}; 30 static mmx_t MMX_0080w = { .ud = {0x00800080, 0x00800080} };
32 static unsigned int ASM_ARRAY(MMX_00FFw) = {0x00ff00ff, 0x00ff00ff}; 31 static mmx_t MMX_00FFw = { .ud = {0x00ff00ff, 0x00ff00ff} };
33 static unsigned int ASM_ARRAY(MMX_FF00w) = {0xff00ff00, 0xff00ff00}; 32 static mmx_t MMX_FF00w = { .ud = {0xff00ff00, 0xff00ff00} };
34 33
35 static unsigned short ASM_ARRAY(MMX_Ycoeff) = {0x004a, 0x004a, 0x004a, 0x004a}; 34 static mmx_t MMX_Ycoeff = { .uw = {0x004a, 0x004a, 0x004a, 0x004a} };
36 35
37 static unsigned short ASM_ARRAY(MMX_UbluRGB) = {0x0072, 0x0072, 0x0072, 0x0072}; 36 static mmx_t MMX_UbluRGB = { .uw = {0x0072, 0x0072, 0x0072, 0x0072} };
38 static unsigned short ASM_ARRAY(MMX_VredRGB) = {0x0059, 0x0059, 0x0059, 0x0059}; 37 static mmx_t MMX_VredRGB = { .uw = {0x0059, 0x0059, 0x0059, 0x0059} };
39 static unsigned short ASM_ARRAY(MMX_UgrnRGB) = {0xffea, 0xffea, 0xffea, 0xffea}; 38 static mmx_t MMX_UgrnRGB = { .uw = {0xffea, 0xffea, 0xffea, 0xffea} };
40 static unsigned short ASM_ARRAY(MMX_VgrnRGB) = {0xffd2, 0xffd2, 0xffd2, 0xffd2}; 39 static mmx_t MMX_VgrnRGB = { .uw = {0xffd2, 0xffd2, 0xffd2, 0xffd2} };
41 40
42 static unsigned short ASM_ARRAY(MMX_Ublu5x5) = {0x0081, 0x0081, 0x0081, 0x0081}; 41 static mmx_t MMX_Ublu5x5 = { .uw = {0x0081, 0x0081, 0x0081, 0x0081} };
43 static unsigned short ASM_ARRAY(MMX_Vred5x5) = {0x0066, 0x0066, 0x0066, 0x0066}; 42 static mmx_t MMX_Vred5x5 = { .uw = {0x0066, 0x0066, 0x0066, 0x0066} };
44 static unsigned short ASM_ARRAY(MMX_Ugrn555) = {0xffe7, 0xffe7, 0xffe7, 0xffe7}; 43 static mmx_t MMX_Ugrn555 = { .uw = {0xffe7, 0xffe7, 0xffe7, 0xffe7} };
45 static unsigned short ASM_ARRAY(MMX_Vgrn555) = {0xffcc, 0xffcc, 0xffcc, 0xffcc}; 44 static mmx_t MMX_Vgrn555 = { .uw = {0xffcc, 0xffcc, 0xffcc, 0xffcc} };
46 static unsigned short ASM_ARRAY(MMX_Ugrn565) = {0xffe8, 0xffe8, 0xffe8, 0xffe8}; 45 static mmx_t MMX_Ugrn565 = { .uw = {0xffe8, 0xffe8, 0xffe8, 0xffe8} };
47 static unsigned short ASM_ARRAY(MMX_Vgrn565) = {0xffcd, 0xffcd, 0xffcd, 0xffcd}; 46 static mmx_t MMX_Vgrn565 = { .uw = {0xffcd, 0xffcd, 0xffcd, 0xffcd} };
48 47
49 static unsigned short ASM_ARRAY(MMX_red555) = {0x7c00, 0x7c00, 0x7c00, 0x7c00}; 48 static mmx_t MMX_red555 = { .uw = {0x7c00, 0x7c00, 0x7c00, 0x7c00} };
50 static unsigned short ASM_ARRAY(MMX_red565) = {0xf800, 0xf800, 0xf800, 0xf800}; 49 static mmx_t MMX_red565 = { .uw = {0xf800, 0xf800, 0xf800, 0xf800} };
51 static unsigned short ASM_ARRAY(MMX_grn555) = {0x03e0, 0x03e0, 0x03e0, 0x03e0}; 50 static mmx_t MMX_grn555 = { .uw = {0x03e0, 0x03e0, 0x03e0, 0x03e0} };
52 static unsigned short ASM_ARRAY(MMX_grn565) = {0x07e0, 0x07e0, 0x07e0, 0x07e0}; 51 static mmx_t MMX_grn565 = { .uw = {0x07e0, 0x07e0, 0x07e0, 0x07e0} };
53 static unsigned short ASM_ARRAY(MMX_blu5x5) = {0x001f, 0x001f, 0x001f, 0x001f}; 52 static mmx_t MMX_blu5x5 = { .uw = {0x001f, 0x001f, 0x001f, 0x001f} };
54 53
55 /** 54 /**
56 This MMX assembler is my first assembler/MMX program ever. 55 This MMX assembler is my first assembler/MMX program ever.
57 Thus it may be buggy. 56 Thus it may be buggy.
58 Send patches to: 57 Send patches to:
84 void ColorRGBDitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix, 83 void ColorRGBDitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix,
85 unsigned char *lum, unsigned char *cr, 84 unsigned char *lum, unsigned char *cr,
86 unsigned char *cb, unsigned char *out, 85 unsigned char *cb, unsigned char *out,
87 int rows, int cols, int mod ) 86 int rows, int cols, int mod )
88 { 87 {
89 Uint32 *row1; 88 Uint32 *row1;
90 Uint32 *row2; 89 Uint32 *row2;
91 90
92 unsigned char* y = lum +cols*rows; // Pointer to the end 91 unsigned char* y = lum +cols*rows; // Pointer to the end
93 int x=0; 92 int x = 0;
94 row1 = (Uint32 *)out; // 32 bit target 93 row1 = (Uint32 *)out; // 32 bit target
95 row2 = (Uint32 *)out+cols+mod; // start of second row 94 row2 = (Uint32 *)out+cols+mod; // start of second row
96 mod = (mod+cols+mod)*4; // increment for row1 in byte 95 mod = (mod+cols+mod)*4; // increment for row1 in byte
97 96
98 __asm__ __volatile__ ( 97 __asm__ __volatile__ (
99 /* We don't really care about PIC - the code should be rewritten to use 98 // tap dance to workaround the inability to use %%ebx at will...
100 relative addressing for the static tables, so right now we take the 99 // move one thing to the stack...
101 COW hit on the pages this code resides. Big deal. 100 "pushl $0\n" // save a slot on the stack.
102 This spill is just to reduce register pressure in the PIC case. */ 101 "pushl %%ebx\n" // save %%ebx.
103 "pushl %%ebx\n" 102 "movl %0, %%ebx\n" // put the thing in ebx.
104 "movl %0, %%ebx\n" 103 "movl %%ebx, 4(%%esp)\n" // put the thing in the stack slot.
105 104 "popl %%ebx\n" // get back %%ebx (the PIC register).
106 ".align 8\n" 105
106 ".align 8\n"
107 "1:\n" 107 "1:\n"
108 108
109 // create Cr (result in mm1) 109 // create Cr (result in mm1)
110 "pushl %%ebx\n"
111 "movl 4(%%esp), %%ebx\n"
110 "movd (%%ebx), %%mm1\n" // 0 0 0 0 v3 v2 v1 v0 112 "movd (%%ebx), %%mm1\n" // 0 0 0 0 v3 v2 v1 v0
113 "popl %%ebx\n"
111 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00 114 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00
112 "movd (%2), %%mm2\n" // 0 0 0 0 l3 l2 l1 l0 115 "movd (%2), %%mm2\n" // 0 0 0 0 l3 l2 l1 l0
113 "punpcklbw %%mm7,%%mm1\n" // 0 v3 0 v2 00 v1 00 v0 116 "punpcklbw %%mm7,%%mm1\n" // 0 v3 0 v2 00 v1 00 v0
114 "punpckldq %%mm1,%%mm1\n" // 00 v1 00 v0 00 v1 00 v0 117 "punpckldq %%mm1,%%mm1\n" // 00 v1 00 v0 00 v1 00 v0
115 "psubw _MMX_0080w,%%mm1\n" // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 118 "psubw %9,%%mm1\n" // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0
116 119
117 // create Cr_g (result in mm0) 120 // create Cr_g (result in mm0)
118 "movq %%mm1,%%mm0\n" // r1 r1 r0 r0 r1 r1 r0 r0 121 "movq %%mm1,%%mm0\n" // r1 r1 r0 r0 r1 r1 r0 r0
119 "pmullw _MMX_VgrnRGB,%%mm0\n"// red*-46dec=0.7136*64 122 "pmullw %10,%%mm0\n" // red*-46dec=0.7136*64
120 "pmullw _MMX_VredRGB,%%mm1\n"// red*89dec=1.4013*64 123 "pmullw %11,%%mm1\n" // red*89dec=1.4013*64
121 "psraw $6, %%mm0\n" // red=red/64 124 "psraw $6, %%mm0\n" // red=red/64
122 "psraw $6, %%mm1\n" // red=red/64 125 "psraw $6, %%mm1\n" // red=red/64
123 126
124 // create L1 L2 (result in mm2,mm4) 127 // create L1 L2 (result in mm2,mm4)
125 // L2=lum+cols 128 // L2=lum+cols
126 "movq (%2,%4),%%mm3\n" // 0 0 0 0 L3 L2 L1 L0 129 "movq (%2,%4),%%mm3\n" // 0 0 0 0 L3 L2 L1 L0
127 "punpckldq %%mm3,%%mm2\n" // L3 L2 L1 L0 l3 l2 l1 l0 130 "punpckldq %%mm3,%%mm2\n" // L3 L2 L1 L0 l3 l2 l1 l0
128 "movq %%mm2,%%mm4\n" // L3 L2 L1 L0 l3 l2 l1 l0 131 "movq %%mm2,%%mm4\n" // L3 L2 L1 L0 l3 l2 l1 l0
129 "pand _MMX_FF00w,%%mm2\n" // L3 0 L1 0 l3 0 l1 0 132 "pand %12,%%mm2\n" // L3 0 L1 0 l3 0 l1 0
130 "pand _MMX_00FFw,%%mm4\n" // 0 L2 0 L0 0 l2 0 l0 133 "pand %13,%%mm4\n" // 0 L2 0 L0 0 l2 0 l0
131 "psrlw $8,%%mm2\n" // 0 L3 0 L1 0 l3 0 l1 134 "psrlw $8,%%mm2\n" // 0 L3 0 L1 0 l3 0 l1
132 135
133 // create R (result in mm6) 136 // create R (result in mm6)
134 "movq %%mm2,%%mm5\n" // 0 L3 0 L1 0 l3 0 l1 137 "movq %%mm2,%%mm5\n" // 0 L3 0 L1 0 l3 0 l1
135 "movq %%mm4,%%mm6\n" // 0 L2 0 L0 0 l2 0 l0 138 "movq %%mm4,%%mm6\n" // 0 L2 0 L0 0 l2 0 l0
142 145
143 // create Cb (result in mm1) 146 // create Cb (result in mm1)
144 "movd (%1), %%mm1\n" // 0 0 0 0 u3 u2 u1 u0 147 "movd (%1), %%mm1\n" // 0 0 0 0 u3 u2 u1 u0
145 "punpcklbw %%mm7,%%mm1\n" // 0 u3 0 u2 00 u1 00 u0 148 "punpcklbw %%mm7,%%mm1\n" // 0 u3 0 u2 00 u1 00 u0
146 "punpckldq %%mm1,%%mm1\n" // 00 u1 00 u0 00 u1 00 u0 149 "punpckldq %%mm1,%%mm1\n" // 00 u1 00 u0 00 u1 00 u0
147 "psubw _MMX_0080w,%%mm1\n" // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 150 "psubw %9,%%mm1\n" // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0
148 // create Cb_g (result in mm5) 151 // create Cb_g (result in mm5)
149 "movq %%mm1,%%mm5\n" // u1 u1 u0 u0 u1 u1 u0 u0 152 "movq %%mm1,%%mm5\n" // u1 u1 u0 u0 u1 u1 u0 u0
150 "pmullw _MMX_UgrnRGB,%%mm5\n" // blue*-109dec=1.7129*64 153 "pmullw %14,%%mm5\n" // blue*-109dec=1.7129*64
151 "pmullw _MMX_UbluRGB,%%mm1\n" // blue*114dec=1.78125*64 154 "pmullw %15,%%mm1\n" // blue*114dec=1.78125*64
152 "psraw $6, %%mm5\n" // blue=red/64 155 "psraw $6, %%mm5\n" // blue=red/64
153 "psraw $6, %%mm1\n" // blue=blue/64 156 "psraw $6, %%mm1\n" // blue=blue/64
154 157
155 // create G (result in mm7) 158 // create G (result in mm7)
156 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1 159 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1
211 "movq %%mm5,8(%5)\n" // wrote out ! row2 214 "movq %%mm5,8(%5)\n" // wrote out ! row2
212 215
213 "addl $4,%2\n" // lum+4 216 "addl $4,%2\n" // lum+4
214 "leal 16(%3),%3\n" // row1+16 217 "leal 16(%3),%3\n" // row1+16
215 "leal 16(%5),%5\n" // row2+16 218 "leal 16(%5),%5\n" // row2+16
216 "addl $2, %%ebx\n" // cr+2 219 "addl $2, (%%esp)\n" // cr+2
217 "addl $2, %1\n" // cb+2 220 "addl $2, %1\n" // cb+2
218 221
219 "addl $4,%6\n" // x+4 222 "addl $4,%6\n" // x+4
220 "cmpl %4,%6\n" 223 "cmpl %4,%6\n"
221 224
224 "addl %8, %3\n" // row1+= mod 227 "addl %8, %3\n" // row1+= mod
225 "addl %8, %5\n" // row2+= mod 228 "addl %8, %5\n" // row2+= mod
226 "movl $0, %6\n" // x=0 229 "movl $0, %6\n" // x=0
227 "cmpl %7, %2\n" 230 "cmpl %7, %2\n"
228 "jl 1b\n" 231 "jl 1b\n"
229 "emms\n" 232
230 "popl %%ebx\n" 233 "addl $4, %%esp\n" // get rid of the stack slot we reserved.
234 "emms\n" // reset MMX registers.
231 : 235 :
232 : "m" (cr), "r"(cb),"r"(lum), 236 : "m" (cr), "r"(cb),"r"(lum),
233 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod)); 237 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod),
238 "m"(MMX_0080w),"m"(MMX_VgrnRGB),"m"(MMX_VredRGB),
239 "m"(MMX_FF00w),"m"(MMX_00FFw),"m"(MMX_UgrnRGB),
240 "m"(MMX_UbluRGB)
241 );
234 } 242 }
235 243
236 void Color565DitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix, 244 void Color565DitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix,
237 unsigned char *lum, unsigned char *cr, 245 unsigned char *lum, unsigned char *cr,
238 unsigned char *cb, unsigned char *out, 246 unsigned char *cb, unsigned char *out,
247 row2 = (Uint16 *)out+cols+mod; /* start of second row */ 255 row2 = (Uint16 *)out+cols+mod; /* start of second row */
248 mod = (mod+cols+mod)*2; /* increment for row1 in byte */ 256 mod = (mod+cols+mod)*2; /* increment for row1 in byte */
249 257
250 258
251 __asm__ __volatile__( 259 __asm__ __volatile__(
252 "pushl %%ebx\n" 260 // tap dance to workaround the inability to use %%ebx at will...
253 "movl %0, %%ebx\n" 261 // move one thing to the stack...
262 "pushl $0\n" // save a slot on the stack.
263 "pushl %%ebx\n" // save %%ebx.
264 "movl %0, %%ebx\n" // put the thing in ebx.
265 "movl %%ebx, 4(%%esp)\n" // put the thing in the stack slot.
266 "popl %%ebx\n" // get back %%ebx (the PIC register).
254 267
255 ".align 8\n" 268 ".align 8\n"
256 "1:\n" 269 "1:\n"
257 "movd (%1), %%mm0\n" // 4 Cb 0 0 0 0 u3 u2 u1 u0 270 "movd (%1), %%mm0\n" // 4 Cb 0 0 0 0 u3 u2 u1 u0
258 "pxor %%mm7, %%mm7\n" 271 "pxor %%mm7, %%mm7\n"
259 "movd (%%ebx), %%mm1\n" // 4 Cr 0 0 0 0 v3 v2 v1 v0 272 "pushl %%ebx\n"
273 "movl 4(%%esp), %%ebx\n"
274 "movd (%%ebx), %%mm1\n" // 4 Cr 0 0 0 0 v3 v2 v1 v0
275 "popl %%ebx\n"
276
260 "punpcklbw %%mm7, %%mm0\n" // 4 W cb 0 u3 0 u2 0 u1 0 u0 277 "punpcklbw %%mm7, %%mm0\n" // 4 W cb 0 u3 0 u2 0 u1 0 u0
261 "punpcklbw %%mm7, %%mm1\n" // 4 W cr 0 v3 0 v2 0 v1 0 v0 278 "punpcklbw %%mm7, %%mm1\n" // 4 W cr 0 v3 0 v2 0 v1 0 v0
262 "psubw _MMX_0080w, %%mm0\n" 279 "psubw %9, %%mm0\n"
263 "psubw _MMX_0080w, %%mm1\n" 280 "psubw %9, %%mm1\n"
264 "movq %%mm0, %%mm2\n" // Cb 0 u3 0 u2 0 u1 0 u0 281 "movq %%mm0, %%mm2\n" // Cb 0 u3 0 u2 0 u1 0 u0
265 "movq %%mm1, %%mm3\n" // Cr 282 "movq %%mm1, %%mm3\n" // Cr
266 "pmullw _MMX_Ugrn565, %%mm2\n" // Cb2green 0 R3 0 R2 0 R1 0 R0 283 "pmullw %10, %%mm2\n" // Cb2green 0 R3 0 R2 0 R1 0 R0
267 "movq (%2), %%mm6\n" // L1 l7 L6 L5 L4 L3 L2 L1 L0 284 "movq (%2), %%mm6\n" // L1 l7 L6 L5 L4 L3 L2 L1 L0
268 "pmullw _MMX_Ublu5x5, %%mm0\n" // Cb2blue 285 "pmullw %11, %%mm0\n" // Cb2blue
269 "pand _MMX_00FFw, %%mm6\n" // L1 00 L6 00 L4 00 L2 00 L0 286 "pand %12, %%mm6\n" // L1 00 L6 00 L4 00 L2 00 L0
270 "pmullw _MMX_Vgrn565, %%mm3\n" // Cr2green 287 "pmullw %13, %%mm3\n" // Cr2green
271 "movq (%2), %%mm7\n" // L2 288 "movq (%2), %%mm7\n" // L2
272 "pmullw _MMX_Vred5x5, %%mm1\n" // Cr2red 289 "pmullw %14, %%mm1\n" // Cr2red
273 "psrlw $8, %%mm7\n" // L2 00 L7 00 L5 00 L3 00 L1 290 "psrlw $8, %%mm7\n" // L2 00 L7 00 L5 00 L3 00 L1
274 "pmullw _MMX_Ycoeff, %%mm6\n" // lum1 291 "pmullw %15, %%mm6\n" // lum1
275 "paddw %%mm3, %%mm2\n" // Cb2green + Cr2green == green 292 "paddw %%mm3, %%mm2\n" // Cb2green + Cr2green == green
276 "pmullw _MMX_Ycoeff, %%mm7\n" // lum2 293 "pmullw %15, %%mm7\n" // lum2
277 294
278 "movq %%mm6, %%mm4\n" // lum1 295 "movq %%mm6, %%mm4\n" // lum1
279 "paddw %%mm0, %%mm6\n" // lum1 +blue 00 B6 00 B4 00 B2 00 B0 296 "paddw %%mm0, %%mm6\n" // lum1 +blue 00 B6 00 B4 00 B2 00 B0
280 "movq %%mm4, %%mm5\n" // lum1 297 "movq %%mm4, %%mm5\n" // lum1
281 "paddw %%mm1, %%mm4\n" // lum1 +red 00 R6 00 R4 00 R2 00 R0 298 "paddw %%mm1, %%mm4\n" // lum1 +red 00 R6 00 R4 00 R2 00 R0
289 "packuswb %%mm5, %%mm5\n" // G1 G1 306 "packuswb %%mm5, %%mm5\n" // G1 G1
290 "packuswb %%mm6, %%mm6\n" // B1 B1 307 "packuswb %%mm6, %%mm6\n" // B1 B1
291 "punpcklbw %%mm4, %%mm4\n" 308 "punpcklbw %%mm4, %%mm4\n"
292 "punpcklbw %%mm5, %%mm5\n" 309 "punpcklbw %%mm5, %%mm5\n"
293 310
294 "pand _MMX_red565, %%mm4\n" 311 "pand %16, %%mm4\n"
295 "psllw $3, %%mm5\n" // GREEN 1 312 "psllw $3, %%mm5\n" // GREEN 1
296 "punpcklbw %%mm6, %%mm6\n" 313 "punpcklbw %%mm6, %%mm6\n"
297 "pand _MMX_grn565, %%mm5\n" 314 "pand %17, %%mm5\n"
298 "pand _MMX_red565, %%mm6\n" 315 "pand %16, %%mm6\n"
299 "por %%mm5, %%mm4\n" // 316 "por %%mm5, %%mm4\n" //
300 "psrlw $11, %%mm6\n" // BLUE 1 317 "psrlw $11, %%mm6\n" // BLUE 1
301 "movq %%mm3, %%mm5\n" // lum2 318 "movq %%mm3, %%mm5\n" // lum2
302 "paddw %%mm1, %%mm3\n" // lum2 +red 00 R7 00 R5 00 R3 00 R1 319 "paddw %%mm1, %%mm3\n" // lum2 +red 00 R7 00 R5 00 R3 00 R1
303 "paddw %%mm2, %%mm5\n" // lum2 +green 00 G7 00 G5 00 G3 00 G1 320 "paddw %%mm2, %%mm5\n" // lum2 +green 00 G7 00 G5 00 G3 00 G1
307 "movq (%2, %4), %%mm6\n" // L3 load lum2 324 "movq (%2, %4), %%mm6\n" // L3 load lum2
308 "psraw $6, %%mm7\n" 325 "psraw $6, %%mm7\n"
309 "packuswb %%mm3, %%mm3\n" 326 "packuswb %%mm3, %%mm3\n"
310 "packuswb %%mm5, %%mm5\n" 327 "packuswb %%mm5, %%mm5\n"
311 "packuswb %%mm7, %%mm7\n" 328 "packuswb %%mm7, %%mm7\n"
312 "pand _MMX_00FFw, %%mm6\n" // L3 329 "pand %12, %%mm6\n" // L3
313 "punpcklbw %%mm3, %%mm3\n" 330 "punpcklbw %%mm3, %%mm3\n"
314 "punpcklbw %%mm5, %%mm5\n" 331 "punpcklbw %%mm5, %%mm5\n"
315 "pmullw _MMX_Ycoeff, %%mm6\n" // lum3 332 "pmullw %15, %%mm6\n" // lum3
316 "punpcklbw %%mm7, %%mm7\n" 333 "punpcklbw %%mm7, %%mm7\n"
317 "psllw $3, %%mm5\n" // GREEN 2 334 "psllw $3, %%mm5\n" // GREEN 2
318 "pand _MMX_red565, %%mm7\n" 335 "pand %16, %%mm7\n"
319 "pand _MMX_red565, %%mm3\n" 336 "pand %16, %%mm3\n"
320 "psrlw $11, %%mm7\n" // BLUE 2 337 "psrlw $11, %%mm7\n" // BLUE 2
321 "pand _MMX_grn565, %%mm5\n" 338 "pand %17, %%mm5\n"
322 "por %%mm7, %%mm3\n" 339 "por %%mm7, %%mm3\n"
323 "movq (%2,%4), %%mm7\n" // L4 load lum2 340 "movq (%2,%4), %%mm7\n" // L4 load lum2
324 "por %%mm5, %%mm3\n" // 341 "por %%mm5, %%mm3\n" //
325 "psrlw $8, %%mm7\n" // L4 342 "psrlw $8, %%mm7\n" // L4
326 "movq %%mm4, %%mm5\n" 343 "movq %%mm4, %%mm5\n"
327 "punpcklwd %%mm3, %%mm4\n" 344 "punpcklwd %%mm3, %%mm4\n"
328 "pmullw _MMX_Ycoeff, %%mm7\n" // lum4 345 "pmullw %15, %%mm7\n" // lum4
329 "punpckhwd %%mm3, %%mm5\n" 346 "punpckhwd %%mm3, %%mm5\n"
330 347
331 "movq %%mm4, (%3)\n" // write row1 348 "movq %%mm4, (%3)\n" // write row1
332 "movq %%mm5, 8(%3)\n" // write row1 349 "movq %%mm5, 8(%3)\n" // write row1
333 350
350 "packuswb %%mm6, %%mm6\n" 367 "packuswb %%mm6, %%mm6\n"
351 "punpcklbw %%mm4, %%mm4\n" 368 "punpcklbw %%mm4, %%mm4\n"
352 "punpcklbw %%mm5, %%mm5\n" 369 "punpcklbw %%mm5, %%mm5\n"
353 "punpcklbw %%mm6, %%mm6\n" 370 "punpcklbw %%mm6, %%mm6\n"
354 "psllw $3, %%mm5\n" // GREEN 3 371 "psllw $3, %%mm5\n" // GREEN 3
355 "pand _MMX_red565, %%mm4\n" 372 "pand %16, %%mm4\n"
356 "psraw $6, %%mm3\n" // psr 6 373 "psraw $6, %%mm3\n" // psr 6
357 "psraw $6, %%mm0\n" 374 "psraw $6, %%mm0\n"
358 "pand _MMX_red565, %%mm6\n" // BLUE 375 "pand %16, %%mm6\n" // BLUE
359 "pand _MMX_grn565, %%mm5\n" 376 "pand %17, %%mm5\n"
360 "psrlw $11, %%mm6\n" // BLUE 3 377 "psrlw $11, %%mm6\n" // BLUE 3
361 "por %%mm5, %%mm4\n" 378 "por %%mm5, %%mm4\n"
362 "psraw $6, %%mm7\n" 379 "psraw $6, %%mm7\n"
363 "por %%mm6, %%mm4\n" 380 "por %%mm6, %%mm4\n"
364 "packuswb %%mm3, %%mm3\n" 381 "packuswb %%mm3, %%mm3\n"
365 "packuswb %%mm0, %%mm0\n" 382 "packuswb %%mm0, %%mm0\n"
366 "packuswb %%mm7, %%mm7\n" 383 "packuswb %%mm7, %%mm7\n"
367 "punpcklbw %%mm3, %%mm3\n" 384 "punpcklbw %%mm3, %%mm3\n"
368 "punpcklbw %%mm0, %%mm0\n" 385 "punpcklbw %%mm0, %%mm0\n"
369 "punpcklbw %%mm7, %%mm7\n" 386 "punpcklbw %%mm7, %%mm7\n"
370 "pand _MMX_red565, %%mm3\n" 387 "pand %16, %%mm3\n"
371 "pand _MMX_red565, %%mm7\n" // BLUE 388 "pand %16, %%mm7\n" // BLUE
372 "psllw $3, %%mm0\n" // GREEN 4 389 "psllw $3, %%mm0\n" // GREEN 4
373 "psrlw $11, %%mm7\n" 390 "psrlw $11, %%mm7\n"
374 "pand _MMX_grn565, %%mm0\n" 391 "pand %17, %%mm0\n"
375 "por %%mm7, %%mm3\n" 392 "por %%mm7, %%mm3\n"
376 "por %%mm0, %%mm3\n" 393 "por %%mm0, %%mm3\n"
377 394
378 "movq %%mm4, %%mm5\n" 395 "movq %%mm4, %%mm5\n"
379 396
380 "punpcklwd %%mm3, %%mm4\n" 397 "punpcklwd %%mm3, %%mm4\n"
381 "punpckhwd %%mm3, %%mm5\n" 398 "punpckhwd %%mm3, %%mm5\n"
382 399
383 "movq %%mm4, (%5)\n" 400 "movq %%mm4, (%5)\n"
384 "movq %%mm5, 8(%5)\n" 401 "movq %%mm5, 8(%5)\n"
385 402
386 "addl $8, %6\n" 403 "addl $8, %6\n"
387 "addl $8, %2\n" 404 "addl $8, %2\n"
388 "addl $4, %%ebx\n" 405 "addl $4, (%%esp)\n"
389 "addl $4, %1\n" 406 "addl $4, %1\n"
390 "cmpl %4, %6\n" 407 "cmpl %4, %6\n"
391 "leal 16(%3), %3\n" 408 "leal 16(%3), %3\n"
392 "leal 16(%5),%5\n" // row2+16 409 "leal 16(%5),%5\n" // row2+16
393 410
394 411
395 "jl 1b\n" 412 "jl 1b\n"
396 "addl %4, %2\n" // lum += cols 413 "addl %4, %2\n" // lum += cols
397 "addl %8, %3\n" // row1+= mod 414 "addl %8, %3\n" // row1+= mod
398 "addl %8, %5\n" // row2+= mod 415 "addl %8, %5\n" // row2+= mod
399 "movl $0, %6\n" // x=0 416 "movl $0, %6\n" // x=0
400 "cmpl %7, %2\n" 417 "cmpl %7, %2\n"
401 "jl 1b\n" 418 "jl 1b\n"
419 "addl $4, %%esp\n" // get rid of the stack slot we reserved.
402 "emms\n" 420 "emms\n"
403 "popl %%ebx\n"
404 : 421 :
405 :"m" (cr), "r"(cb),"r"(lum), 422 : "m" (cr), "r"(cb),"r"(lum),
406 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod)); 423 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod),
424 "m"(MMX_0080w),"m"(MMX_Ugrn565),"m"(MMX_Ublu5x5),
425 "m"(MMX_00FFw),"m"(MMX_Vgrn565),"m"(MMX_Vred5x5),
426 "m"(MMX_Ycoeff),"m"(MMX_red565),"m"(MMX_grn565));
407 } 427 }
408 428
409 #endif /* GCC i386 inline assembly */ 429 #endif /* GCC3 i386 inline assembly */
410 #endif /* 0 */ 430