Mercurial > sdl-ios-xcode
comparison src/video/SDL_yuv_mmx.c @ 4045:f420bba13676 SDL-1.2
GCC inline asm for MMX YUV processing no longer has text relocations (textrels) and now works when
gcc wants to hog %ebx for the PIC register.
Fixes Bugzilla #418.
author | Ryan C. Gordon <icculus@icculus.org> |
---|---|
date | Wed, 11 Jul 2007 06:26:22 +0000 |
parents | 40edc79b0926 |
children | 3a9e60224efe |
comparison
equal
deleted
inserted
replaced
4044:009d85e98922 | 4045:f420bba13676 |
---|---|
19 Sam Lantinga | 19 Sam Lantinga |
20 slouken@libsdl.org | 20 slouken@libsdl.org |
21 */ | 21 */ |
22 #include "SDL_config.h" | 22 #include "SDL_config.h" |
23 | 23 |
24 #if 0 /* FIXME: This code needs to be rewritten to reference the static data using relocatable addresses (e.g. http://www.gentoo.org/proj/en/hardened/pic-fix-guide.xml or http://nasm.sourceforge.net/doc/html/nasmdoc8.html#section-8.2) This code currently breaks on systems with readonly text segments (hardened Linux / Intel Mac) */ | 24 #if (__GNUC__ > 2) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES |
25 #if defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES | |
26 | 25 |
27 #include "SDL_stdinc.h" | 26 #include "SDL_stdinc.h" |
28 | 27 |
29 #define ASM_ARRAY(x) x[] __asm__("_" #x) __attribute__((used)) | 28 #include "mmx.h" |
30 | 29 |
31 static unsigned int ASM_ARRAY(MMX_0080w) = {0x00800080, 0x00800080}; | 30 static mmx_t MMX_0080w = { .ud = {0x00800080, 0x00800080} }; |
32 static unsigned int ASM_ARRAY(MMX_00FFw) = {0x00ff00ff, 0x00ff00ff}; | 31 static mmx_t MMX_00FFw = { .ud = {0x00ff00ff, 0x00ff00ff} }; |
33 static unsigned int ASM_ARRAY(MMX_FF00w) = {0xff00ff00, 0xff00ff00}; | 32 static mmx_t MMX_FF00w = { .ud = {0xff00ff00, 0xff00ff00} }; |
34 | 33 |
35 static unsigned short ASM_ARRAY(MMX_Ycoeff) = {0x004a, 0x004a, 0x004a, 0x004a}; | 34 static mmx_t MMX_Ycoeff = { .uw = {0x004a, 0x004a, 0x004a, 0x004a} }; |
36 | 35 |
37 static unsigned short ASM_ARRAY(MMX_UbluRGB) = {0x0072, 0x0072, 0x0072, 0x0072}; | 36 static mmx_t MMX_UbluRGB = { .uw = {0x0072, 0x0072, 0x0072, 0x0072} }; |
38 static unsigned short ASM_ARRAY(MMX_VredRGB) = {0x0059, 0x0059, 0x0059, 0x0059}; | 37 static mmx_t MMX_VredRGB = { .uw = {0x0059, 0x0059, 0x0059, 0x0059} }; |
39 static unsigned short ASM_ARRAY(MMX_UgrnRGB) = {0xffea, 0xffea, 0xffea, 0xffea}; | 38 static mmx_t MMX_UgrnRGB = { .uw = {0xffea, 0xffea, 0xffea, 0xffea} }; |
40 static unsigned short ASM_ARRAY(MMX_VgrnRGB) = {0xffd2, 0xffd2, 0xffd2, 0xffd2}; | 39 static mmx_t MMX_VgrnRGB = { .uw = {0xffd2, 0xffd2, 0xffd2, 0xffd2} }; |
41 | 40 |
42 static unsigned short ASM_ARRAY(MMX_Ublu5x5) = {0x0081, 0x0081, 0x0081, 0x0081}; | 41 static mmx_t MMX_Ublu5x5 = { .uw = {0x0081, 0x0081, 0x0081, 0x0081} }; |
43 static unsigned short ASM_ARRAY(MMX_Vred5x5) = {0x0066, 0x0066, 0x0066, 0x0066}; | 42 static mmx_t MMX_Vred5x5 = { .uw = {0x0066, 0x0066, 0x0066, 0x0066} }; |
44 static unsigned short ASM_ARRAY(MMX_Ugrn555) = {0xffe7, 0xffe7, 0xffe7, 0xffe7}; | 43 static mmx_t MMX_Ugrn555 = { .uw = {0xffe7, 0xffe7, 0xffe7, 0xffe7} }; |
45 static unsigned short ASM_ARRAY(MMX_Vgrn555) = {0xffcc, 0xffcc, 0xffcc, 0xffcc}; | 44 static mmx_t MMX_Vgrn555 = { .uw = {0xffcc, 0xffcc, 0xffcc, 0xffcc} }; |
46 static unsigned short ASM_ARRAY(MMX_Ugrn565) = {0xffe8, 0xffe8, 0xffe8, 0xffe8}; | 45 static mmx_t MMX_Ugrn565 = { .uw = {0xffe8, 0xffe8, 0xffe8, 0xffe8} }; |
47 static unsigned short ASM_ARRAY(MMX_Vgrn565) = {0xffcd, 0xffcd, 0xffcd, 0xffcd}; | 46 static mmx_t MMX_Vgrn565 = { .uw = {0xffcd, 0xffcd, 0xffcd, 0xffcd} }; |
48 | 47 |
49 static unsigned short ASM_ARRAY(MMX_red555) = {0x7c00, 0x7c00, 0x7c00, 0x7c00}; | 48 static mmx_t MMX_red555 = { .uw = {0x7c00, 0x7c00, 0x7c00, 0x7c00} }; |
50 static unsigned short ASM_ARRAY(MMX_red565) = {0xf800, 0xf800, 0xf800, 0xf800}; | 49 static mmx_t MMX_red565 = { .uw = {0xf800, 0xf800, 0xf800, 0xf800} }; |
51 static unsigned short ASM_ARRAY(MMX_grn555) = {0x03e0, 0x03e0, 0x03e0, 0x03e0}; | 50 static mmx_t MMX_grn555 = { .uw = {0x03e0, 0x03e0, 0x03e0, 0x03e0} }; |
52 static unsigned short ASM_ARRAY(MMX_grn565) = {0x07e0, 0x07e0, 0x07e0, 0x07e0}; | 51 static mmx_t MMX_grn565 = { .uw = {0x07e0, 0x07e0, 0x07e0, 0x07e0} }; |
53 static unsigned short ASM_ARRAY(MMX_blu5x5) = {0x001f, 0x001f, 0x001f, 0x001f}; | 52 static mmx_t MMX_blu5x5 = { .uw = {0x001f, 0x001f, 0x001f, 0x001f} }; |
54 | 53 |
55 /** | 54 /** |
56 This MMX assembler is my first assembler/MMX program ever. | 55 This MMX assembler is my first assembler/MMX program ever. |
57 Thus it may be buggy. | 56 Thus it may be buggy. |
58 Send patches to: | 57 Send patches to: |
84 void ColorRGBDitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix, | 83 void ColorRGBDitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix, |
85 unsigned char *lum, unsigned char *cr, | 84 unsigned char *lum, unsigned char *cr, |
86 unsigned char *cb, unsigned char *out, | 85 unsigned char *cb, unsigned char *out, |
87 int rows, int cols, int mod ) | 86 int rows, int cols, int mod ) |
88 { | 87 { |
89 Uint32 *row1; | 88 Uint32 *row1; |
90 Uint32 *row2; | 89 Uint32 *row2; |
91 | 90 |
92 unsigned char* y = lum +cols*rows; // Pointer to the end | 91 unsigned char* y = lum +cols*rows; // Pointer to the end |
93 int x=0; | 92 int x = 0; |
94 row1 = (Uint32 *)out; // 32 bit target | 93 row1 = (Uint32 *)out; // 32 bit target |
95 row2 = (Uint32 *)out+cols+mod; // start of second row | 94 row2 = (Uint32 *)out+cols+mod; // start of second row |
96 mod = (mod+cols+mod)*4; // increment for row1 in byte | 95 mod = (mod+cols+mod)*4; // increment for row1 in byte |
97 | 96 |
98 __asm__ __volatile__ ( | 97 __asm__ __volatile__ ( |
99 /* We don't really care about PIC - the code should be rewritten to use | 98 // tap dance to workaround the inability to use %%ebx at will... |
100 relative addressing for the static tables, so right now we take the | 99 // move one thing to the stack... |
101 COW hit on the pages this code resides. Big deal. | 100 "pushl $0\n" // save a slot on the stack. |
102 This spill is just to reduce register pressure in the PIC case. */ | 101 "pushl %%ebx\n" // save %%ebx. |
103 "pushl %%ebx\n" | 102 "movl %0, %%ebx\n" // put the thing in ebx. |
104 "movl %0, %%ebx\n" | 103 "movl %%ebx, 4(%%esp)\n" // put the thing in the stack slot. |
105 | 104 "popl %%ebx\n" // get back %%ebx (the PIC register). |
106 ".align 8\n" | 105 |
106 ".align 8\n" | |
107 "1:\n" | 107 "1:\n" |
108 | 108 |
109 // create Cr (result in mm1) | 109 // create Cr (result in mm1) |
110 "pushl %%ebx\n" | |
111 "movl 4(%%esp), %%ebx\n" | |
110 "movd (%%ebx), %%mm1\n" // 0 0 0 0 v3 v2 v1 v0 | 112 "movd (%%ebx), %%mm1\n" // 0 0 0 0 v3 v2 v1 v0 |
113 "popl %%ebx\n" | |
111 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00 | 114 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00 |
112 "movd (%2), %%mm2\n" // 0 0 0 0 l3 l2 l1 l0 | 115 "movd (%2), %%mm2\n" // 0 0 0 0 l3 l2 l1 l0 |
113 "punpcklbw %%mm7,%%mm1\n" // 0 v3 0 v2 00 v1 00 v0 | 116 "punpcklbw %%mm7,%%mm1\n" // 0 v3 0 v2 00 v1 00 v0 |
114 "punpckldq %%mm1,%%mm1\n" // 00 v1 00 v0 00 v1 00 v0 | 117 "punpckldq %%mm1,%%mm1\n" // 00 v1 00 v0 00 v1 00 v0 |
115 "psubw _MMX_0080w,%%mm1\n" // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 | 118 "psubw %9,%%mm1\n" // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 |
116 | 119 |
117 // create Cr_g (result in mm0) | 120 // create Cr_g (result in mm0) |
118 "movq %%mm1,%%mm0\n" // r1 r1 r0 r0 r1 r1 r0 r0 | 121 "movq %%mm1,%%mm0\n" // r1 r1 r0 r0 r1 r1 r0 r0 |
119 "pmullw _MMX_VgrnRGB,%%mm0\n"// red*-46dec=0.7136*64 | 122 "pmullw %10,%%mm0\n" // red*-46dec=0.7136*64 |
120 "pmullw _MMX_VredRGB,%%mm1\n"// red*89dec=1.4013*64 | 123 "pmullw %11,%%mm1\n" // red*89dec=1.4013*64 |
121 "psraw $6, %%mm0\n" // red=red/64 | 124 "psraw $6, %%mm0\n" // red=red/64 |
122 "psraw $6, %%mm1\n" // red=red/64 | 125 "psraw $6, %%mm1\n" // red=red/64 |
123 | 126 |
124 // create L1 L2 (result in mm2,mm4) | 127 // create L1 L2 (result in mm2,mm4) |
125 // L2=lum+cols | 128 // L2=lum+cols |
126 "movq (%2,%4),%%mm3\n" // 0 0 0 0 L3 L2 L1 L0 | 129 "movq (%2,%4),%%mm3\n" // 0 0 0 0 L3 L2 L1 L0 |
127 "punpckldq %%mm3,%%mm2\n" // L3 L2 L1 L0 l3 l2 l1 l0 | 130 "punpckldq %%mm3,%%mm2\n" // L3 L2 L1 L0 l3 l2 l1 l0 |
128 "movq %%mm2,%%mm4\n" // L3 L2 L1 L0 l3 l2 l1 l0 | 131 "movq %%mm2,%%mm4\n" // L3 L2 L1 L0 l3 l2 l1 l0 |
129 "pand _MMX_FF00w,%%mm2\n" // L3 0 L1 0 l3 0 l1 0 | 132 "pand %12,%%mm2\n" // L3 0 L1 0 l3 0 l1 0 |
130 "pand _MMX_00FFw,%%mm4\n" // 0 L2 0 L0 0 l2 0 l0 | 133 "pand %13,%%mm4\n" // 0 L2 0 L0 0 l2 0 l0 |
131 "psrlw $8,%%mm2\n" // 0 L3 0 L1 0 l3 0 l1 | 134 "psrlw $8,%%mm2\n" // 0 L3 0 L1 0 l3 0 l1 |
132 | 135 |
133 // create R (result in mm6) | 136 // create R (result in mm6) |
134 "movq %%mm2,%%mm5\n" // 0 L3 0 L1 0 l3 0 l1 | 137 "movq %%mm2,%%mm5\n" // 0 L3 0 L1 0 l3 0 l1 |
135 "movq %%mm4,%%mm6\n" // 0 L2 0 L0 0 l2 0 l0 | 138 "movq %%mm4,%%mm6\n" // 0 L2 0 L0 0 l2 0 l0 |
142 | 145 |
143 // create Cb (result in mm1) | 146 // create Cb (result in mm1) |
144 "movd (%1), %%mm1\n" // 0 0 0 0 u3 u2 u1 u0 | 147 "movd (%1), %%mm1\n" // 0 0 0 0 u3 u2 u1 u0 |
145 "punpcklbw %%mm7,%%mm1\n" // 0 u3 0 u2 00 u1 00 u0 | 148 "punpcklbw %%mm7,%%mm1\n" // 0 u3 0 u2 00 u1 00 u0 |
146 "punpckldq %%mm1,%%mm1\n" // 00 u1 00 u0 00 u1 00 u0 | 149 "punpckldq %%mm1,%%mm1\n" // 00 u1 00 u0 00 u1 00 u0 |
147 "psubw _MMX_0080w,%%mm1\n" // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 | 150 "psubw %9,%%mm1\n" // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 |
148 // create Cb_g (result in mm5) | 151 // create Cb_g (result in mm5) |
149 "movq %%mm1,%%mm5\n" // u1 u1 u0 u0 u1 u1 u0 u0 | 152 "movq %%mm1,%%mm5\n" // u1 u1 u0 u0 u1 u1 u0 u0 |
150 "pmullw _MMX_UgrnRGB,%%mm5\n" // blue*-109dec=1.7129*64 | 153 "pmullw %14,%%mm5\n" // blue*-109dec=1.7129*64 |
151 "pmullw _MMX_UbluRGB,%%mm1\n" // blue*114dec=1.78125*64 | 154 "pmullw %15,%%mm1\n" // blue*114dec=1.78125*64 |
152 "psraw $6, %%mm5\n" // blue=blue/64 | 155 "psraw $6, %%mm5\n" // blue=blue/64 |
153 "psraw $6, %%mm1\n" // blue=blue/64 | 156 "psraw $6, %%mm1\n" // blue=blue/64 |
154 | 157 |
155 // create G (result in mm7) | 158 // create G (result in mm7) |
156 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1 | 159 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1 |
211 "movq %%mm5,8(%5)\n" // wrote out ! row2 | 214 "movq %%mm5,8(%5)\n" // wrote out ! row2 |
212 | 215 |
213 "addl $4,%2\n" // lum+4 | 216 "addl $4,%2\n" // lum+4 |
214 "leal 16(%3),%3\n" // row1+16 | 217 "leal 16(%3),%3\n" // row1+16 |
215 "leal 16(%5),%5\n" // row2+16 | 218 "leal 16(%5),%5\n" // row2+16 |
216 "addl $2, %%ebx\n" // cr+2 | 219 "addl $2, (%%esp)\n" // cr+2 |
217 "addl $2, %1\n" // cb+2 | 220 "addl $2, %1\n" // cb+2 |
218 | 221 |
219 "addl $4,%6\n" // x+4 | 222 "addl $4,%6\n" // x+4 |
220 "cmpl %4,%6\n" | 223 "cmpl %4,%6\n" |
221 | 224 |
224 "addl %8, %3\n" // row1+= mod | 227 "addl %8, %3\n" // row1+= mod |
225 "addl %8, %5\n" // row2+= mod | 228 "addl %8, %5\n" // row2+= mod |
226 "movl $0, %6\n" // x=0 | 229 "movl $0, %6\n" // x=0 |
227 "cmpl %7, %2\n" | 230 "cmpl %7, %2\n" |
228 "jl 1b\n" | 231 "jl 1b\n" |
229 "emms\n" | 232 |
230 "popl %%ebx\n" | 233 "addl $4, %%esp\n" // get rid of the stack slot we reserved. |
234 "emms\n" // reset MMX registers. | |
231 : | 235 : |
232 : "m" (cr), "r"(cb),"r"(lum), | 236 : "m" (cr), "r"(cb),"r"(lum), |
233 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod)); | 237 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod), |
238 "m"(MMX_0080w),"m"(MMX_VgrnRGB),"m"(MMX_VredRGB), | |
239 "m"(MMX_FF00w),"m"(MMX_00FFw),"m"(MMX_UgrnRGB), | |
240 "m"(MMX_UbluRGB) | |
241 ); | |
234 } | 242 } |
235 | 243 |
236 void Color565DitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix, | 244 void Color565DitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix, |
237 unsigned char *lum, unsigned char *cr, | 245 unsigned char *lum, unsigned char *cr, |
238 unsigned char *cb, unsigned char *out, | 246 unsigned char *cb, unsigned char *out, |
247 row2 = (Uint16 *)out+cols+mod; /* start of second row */ | 255 row2 = (Uint16 *)out+cols+mod; /* start of second row */ |
248 mod = (mod+cols+mod)*2; /* increment for row1 in byte */ | 256 mod = (mod+cols+mod)*2; /* increment for row1 in byte */ |
249 | 257 |
250 | 258 |
251 __asm__ __volatile__( | 259 __asm__ __volatile__( |
252 "pushl %%ebx\n" | 260 // tap dance to workaround the inability to use %%ebx at will... |
253 "movl %0, %%ebx\n" | 261 // move one thing to the stack... |
262 "pushl $0\n" // save a slot on the stack. | |
263 "pushl %%ebx\n" // save %%ebx. | |
264 "movl %0, %%ebx\n" // put the thing in ebx. | |
265 "movl %%ebx, 4(%%esp)\n" // put the thing in the stack slot. | |
266 "popl %%ebx\n" // get back %%ebx (the PIC register). | |
254 | 267 |
255 ".align 8\n" | 268 ".align 8\n" |
256 "1:\n" | 269 "1:\n" |
257 "movd (%1), %%mm0\n" // 4 Cb 0 0 0 0 u3 u2 u1 u0 | 270 "movd (%1), %%mm0\n" // 4 Cb 0 0 0 0 u3 u2 u1 u0 |
258 "pxor %%mm7, %%mm7\n" | 271 "pxor %%mm7, %%mm7\n" |
259 "movd (%%ebx), %%mm1\n" // 4 Cr 0 0 0 0 v3 v2 v1 v0 | 272 "pushl %%ebx\n" |
273 "movl 4(%%esp), %%ebx\n" | |
274 "movd (%%ebx), %%mm1\n" // 4 Cr 0 0 0 0 v3 v2 v1 v0 | |
275 "popl %%ebx\n" | |
276 | |
260 "punpcklbw %%mm7, %%mm0\n" // 4 W cb 0 u3 0 u2 0 u1 0 u0 | 277 "punpcklbw %%mm7, %%mm0\n" // 4 W cb 0 u3 0 u2 0 u1 0 u0 |
261 "punpcklbw %%mm7, %%mm1\n" // 4 W cr 0 v3 0 v2 0 v1 0 v0 | 278 "punpcklbw %%mm7, %%mm1\n" // 4 W cr 0 v3 0 v2 0 v1 0 v0 |
262 "psubw _MMX_0080w, %%mm0\n" | 279 "psubw %9, %%mm0\n" |
263 "psubw _MMX_0080w, %%mm1\n" | 280 "psubw %9, %%mm1\n" |
264 "movq %%mm0, %%mm2\n" // Cb 0 u3 0 u2 0 u1 0 u0 | 281 "movq %%mm0, %%mm2\n" // Cb 0 u3 0 u2 0 u1 0 u0 |
265 "movq %%mm1, %%mm3\n" // Cr | 282 "movq %%mm1, %%mm3\n" // Cr |
266 "pmullw _MMX_Ugrn565, %%mm2\n" // Cb2green 0 R3 0 R2 0 R1 0 R0 | 283 "pmullw %10, %%mm2\n" // Cb2green 0 R3 0 R2 0 R1 0 R0 |
267 "movq (%2), %%mm6\n" // L1 l7 L6 L5 L4 L3 L2 L1 L0 | 284 "movq (%2), %%mm6\n" // L1 l7 L6 L5 L4 L3 L2 L1 L0 |
268 "pmullw _MMX_Ublu5x5, %%mm0\n" // Cb2blue | 285 "pmullw %11, %%mm0\n" // Cb2blue |
269 "pand _MMX_00FFw, %%mm6\n" // L1 00 L6 00 L4 00 L2 00 L0 | 286 "pand %12, %%mm6\n" // L1 00 L6 00 L4 00 L2 00 L0 |
270 "pmullw _MMX_Vgrn565, %%mm3\n" // Cr2green | 287 "pmullw %13, %%mm3\n" // Cr2green |
271 "movq (%2), %%mm7\n" // L2 | 288 "movq (%2), %%mm7\n" // L2 |
272 "pmullw _MMX_Vred5x5, %%mm1\n" // Cr2red | 289 "pmullw %14, %%mm1\n" // Cr2red |
273 "psrlw $8, %%mm7\n" // L2 00 L7 00 L5 00 L3 00 L1 | 290 "psrlw $8, %%mm7\n" // L2 00 L7 00 L5 00 L3 00 L1 |
274 "pmullw _MMX_Ycoeff, %%mm6\n" // lum1 | 291 "pmullw %15, %%mm6\n" // lum1 |
275 "paddw %%mm3, %%mm2\n" // Cb2green + Cr2green == green | 292 "paddw %%mm3, %%mm2\n" // Cb2green + Cr2green == green |
276 "pmullw _MMX_Ycoeff, %%mm7\n" // lum2 | 293 "pmullw %15, %%mm7\n" // lum2 |
277 | 294 |
278 "movq %%mm6, %%mm4\n" // lum1 | 295 "movq %%mm6, %%mm4\n" // lum1 |
279 "paddw %%mm0, %%mm6\n" // lum1 +blue 00 B6 00 B4 00 B2 00 B0 | 296 "paddw %%mm0, %%mm6\n" // lum1 +blue 00 B6 00 B4 00 B2 00 B0 |
280 "movq %%mm4, %%mm5\n" // lum1 | 297 "movq %%mm4, %%mm5\n" // lum1 |
281 "paddw %%mm1, %%mm4\n" // lum1 +red 00 R6 00 R4 00 R2 00 R0 | 298 "paddw %%mm1, %%mm4\n" // lum1 +red 00 R6 00 R4 00 R2 00 R0 |
289 "packuswb %%mm5, %%mm5\n" // G1 G1 | 306 "packuswb %%mm5, %%mm5\n" // G1 G1 |
290 "packuswb %%mm6, %%mm6\n" // B1 B1 | 307 "packuswb %%mm6, %%mm6\n" // B1 B1 |
291 "punpcklbw %%mm4, %%mm4\n" | 308 "punpcklbw %%mm4, %%mm4\n" |
292 "punpcklbw %%mm5, %%mm5\n" | 309 "punpcklbw %%mm5, %%mm5\n" |
293 | 310 |
294 "pand _MMX_red565, %%mm4\n" | 311 "pand %16, %%mm4\n" |
295 "psllw $3, %%mm5\n" // GREEN 1 | 312 "psllw $3, %%mm5\n" // GREEN 1 |
296 "punpcklbw %%mm6, %%mm6\n" | 313 "punpcklbw %%mm6, %%mm6\n" |
297 "pand _MMX_grn565, %%mm5\n" | 314 "pand %17, %%mm5\n" |
298 "pand _MMX_red565, %%mm6\n" | 315 "pand %16, %%mm6\n" |
299 "por %%mm5, %%mm4\n" // | 316 "por %%mm5, %%mm4\n" // |
300 "psrlw $11, %%mm6\n" // BLUE 1 | 317 "psrlw $11, %%mm6\n" // BLUE 1 |
301 "movq %%mm3, %%mm5\n" // lum2 | 318 "movq %%mm3, %%mm5\n" // lum2 |
302 "paddw %%mm1, %%mm3\n" // lum2 +red 00 R7 00 R5 00 R3 00 R1 | 319 "paddw %%mm1, %%mm3\n" // lum2 +red 00 R7 00 R5 00 R3 00 R1 |
303 "paddw %%mm2, %%mm5\n" // lum2 +green 00 G7 00 G5 00 G3 00 G1 | 320 "paddw %%mm2, %%mm5\n" // lum2 +green 00 G7 00 G5 00 G3 00 G1 |
307 "movq (%2, %4), %%mm6\n" // L3 load lum2 | 324 "movq (%2, %4), %%mm6\n" // L3 load lum2 |
308 "psraw $6, %%mm7\n" | 325 "psraw $6, %%mm7\n" |
309 "packuswb %%mm3, %%mm3\n" | 326 "packuswb %%mm3, %%mm3\n" |
310 "packuswb %%mm5, %%mm5\n" | 327 "packuswb %%mm5, %%mm5\n" |
311 "packuswb %%mm7, %%mm7\n" | 328 "packuswb %%mm7, %%mm7\n" |
312 "pand _MMX_00FFw, %%mm6\n" // L3 | 329 "pand %12, %%mm6\n" // L3 |
313 "punpcklbw %%mm3, %%mm3\n" | 330 "punpcklbw %%mm3, %%mm3\n" |
314 "punpcklbw %%mm5, %%mm5\n" | 331 "punpcklbw %%mm5, %%mm5\n" |
315 "pmullw _MMX_Ycoeff, %%mm6\n" // lum3 | 332 "pmullw %15, %%mm6\n" // lum3 |
316 "punpcklbw %%mm7, %%mm7\n" | 333 "punpcklbw %%mm7, %%mm7\n" |
317 "psllw $3, %%mm5\n" // GREEN 2 | 334 "psllw $3, %%mm5\n" // GREEN 2 |
318 "pand _MMX_red565, %%mm7\n" | 335 "pand %16, %%mm7\n" |
319 "pand _MMX_red565, %%mm3\n" | 336 "pand %16, %%mm3\n" |
320 "psrlw $11, %%mm7\n" // BLUE 2 | 337 "psrlw $11, %%mm7\n" // BLUE 2 |
321 "pand _MMX_grn565, %%mm5\n" | 338 "pand %17, %%mm5\n" |
322 "por %%mm7, %%mm3\n" | 339 "por %%mm7, %%mm3\n" |
323 "movq (%2,%4), %%mm7\n" // L4 load lum2 | 340 "movq (%2,%4), %%mm7\n" // L4 load lum2 |
324 "por %%mm5, %%mm3\n" // | 341 "por %%mm5, %%mm3\n" // |
325 "psrlw $8, %%mm7\n" // L4 | 342 "psrlw $8, %%mm7\n" // L4 |
326 "movq %%mm4, %%mm5\n" | 343 "movq %%mm4, %%mm5\n" |
327 "punpcklwd %%mm3, %%mm4\n" | 344 "punpcklwd %%mm3, %%mm4\n" |
328 "pmullw _MMX_Ycoeff, %%mm7\n" // lum4 | 345 "pmullw %15, %%mm7\n" // lum4 |
329 "punpckhwd %%mm3, %%mm5\n" | 346 "punpckhwd %%mm3, %%mm5\n" |
330 | 347 |
331 "movq %%mm4, (%3)\n" // write row1 | 348 "movq %%mm4, (%3)\n" // write row1 |
332 "movq %%mm5, 8(%3)\n" // write row1 | 349 "movq %%mm5, 8(%3)\n" // write row1 |
333 | 350 |
350 "packuswb %%mm6, %%mm6\n" | 367 "packuswb %%mm6, %%mm6\n" |
351 "punpcklbw %%mm4, %%mm4\n" | 368 "punpcklbw %%mm4, %%mm4\n" |
352 "punpcklbw %%mm5, %%mm5\n" | 369 "punpcklbw %%mm5, %%mm5\n" |
353 "punpcklbw %%mm6, %%mm6\n" | 370 "punpcklbw %%mm6, %%mm6\n" |
354 "psllw $3, %%mm5\n" // GREEN 3 | 371 "psllw $3, %%mm5\n" // GREEN 3 |
355 "pand _MMX_red565, %%mm4\n" | 372 "pand %16, %%mm4\n" |
356 "psraw $6, %%mm3\n" // psr 6 | 373 "psraw $6, %%mm3\n" // psr 6 |
357 "psraw $6, %%mm0\n" | 374 "psraw $6, %%mm0\n" |
358 "pand _MMX_red565, %%mm6\n" // BLUE | 375 "pand %16, %%mm6\n" // BLUE |
359 "pand _MMX_grn565, %%mm5\n" | 376 "pand %17, %%mm5\n" |
360 "psrlw $11, %%mm6\n" // BLUE 3 | 377 "psrlw $11, %%mm6\n" // BLUE 3 |
361 "por %%mm5, %%mm4\n" | 378 "por %%mm5, %%mm4\n" |
362 "psraw $6, %%mm7\n" | 379 "psraw $6, %%mm7\n" |
363 "por %%mm6, %%mm4\n" | 380 "por %%mm6, %%mm4\n" |
364 "packuswb %%mm3, %%mm3\n" | 381 "packuswb %%mm3, %%mm3\n" |
365 "packuswb %%mm0, %%mm0\n" | 382 "packuswb %%mm0, %%mm0\n" |
366 "packuswb %%mm7, %%mm7\n" | 383 "packuswb %%mm7, %%mm7\n" |
367 "punpcklbw %%mm3, %%mm3\n" | 384 "punpcklbw %%mm3, %%mm3\n" |
368 "punpcklbw %%mm0, %%mm0\n" | 385 "punpcklbw %%mm0, %%mm0\n" |
369 "punpcklbw %%mm7, %%mm7\n" | 386 "punpcklbw %%mm7, %%mm7\n" |
370 "pand _MMX_red565, %%mm3\n" | 387 "pand %16, %%mm3\n" |
371 "pand _MMX_red565, %%mm7\n" // BLUE | 388 "pand %16, %%mm7\n" // BLUE |
372 "psllw $3, %%mm0\n" // GREEN 4 | 389 "psllw $3, %%mm0\n" // GREEN 4 |
373 "psrlw $11, %%mm7\n" | 390 "psrlw $11, %%mm7\n" |
374 "pand _MMX_grn565, %%mm0\n" | 391 "pand %17, %%mm0\n" |
375 "por %%mm7, %%mm3\n" | 392 "por %%mm7, %%mm3\n" |
376 "por %%mm0, %%mm3\n" | 393 "por %%mm0, %%mm3\n" |
377 | 394 |
378 "movq %%mm4, %%mm5\n" | 395 "movq %%mm4, %%mm5\n" |
379 | 396 |
380 "punpcklwd %%mm3, %%mm4\n" | 397 "punpcklwd %%mm3, %%mm4\n" |
381 "punpckhwd %%mm3, %%mm5\n" | 398 "punpckhwd %%mm3, %%mm5\n" |
382 | 399 |
383 "movq %%mm4, (%5)\n" | 400 "movq %%mm4, (%5)\n" |
384 "movq %%mm5, 8(%5)\n" | 401 "movq %%mm5, 8(%5)\n" |
385 | 402 |
386 "addl $8, %6\n" | 403 "addl $8, %6\n" |
387 "addl $8, %2\n" | 404 "addl $8, %2\n" |
388 "addl $4, %%ebx\n" | 405 "addl $4, (%%esp)\n" |
389 "addl $4, %1\n" | 406 "addl $4, %1\n" |
390 "cmpl %4, %6\n" | 407 "cmpl %4, %6\n" |
391 "leal 16(%3), %3\n" | 408 "leal 16(%3), %3\n" |
392 "leal 16(%5),%5\n" // row2+16 | 409 "leal 16(%5),%5\n" // row2+16 |
393 | 410 |
394 | 411 |
395 "jl 1b\n" | 412 "jl 1b\n" |
396 "addl %4, %2\n" // lum += cols | 413 "addl %4, %2\n" // lum += cols |
397 "addl %8, %3\n" // row1+= mod | 414 "addl %8, %3\n" // row1+= mod |
398 "addl %8, %5\n" // row2+= mod | 415 "addl %8, %5\n" // row2+= mod |
399 "movl $0, %6\n" // x=0 | 416 "movl $0, %6\n" // x=0 |
400 "cmpl %7, %2\n" | 417 "cmpl %7, %2\n" |
401 "jl 1b\n" | 418 "jl 1b\n" |
419 "addl $4, %%esp\n" // get rid of the stack slot we reserved. | |
402 "emms\n" | 420 "emms\n" |
403 "popl %%ebx\n" | |
404 : | 421 : |
405 :"m" (cr), "r"(cb),"r"(lum), | 422 : "m" (cr), "r"(cb),"r"(lum), |
406 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod)); | 423 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod), |
424 "m"(MMX_0080w),"m"(MMX_Ugrn565),"m"(MMX_Ublu5x5), | |
425 "m"(MMX_00FFw),"m"(MMX_Vgrn565),"m"(MMX_Vred5x5), | |
426 "m"(MMX_Ycoeff),"m"(MMX_red565),"m"(MMX_grn565)); | |
407 } | 427 } |
408 | 428 |
409 #endif /* GCC i386 inline assembly */ | 429 #endif /* GCC3 i386 inline assembly */ |
410 #endif /* 0 */ | 430 |