comparison src/video/SDL_yuv_mmx.c @ 2167:8f2174e22cd5

indent doesn't know how to handle inline asm
author Sam Lantinga <slouken@libsdl.org>
date Wed, 11 Jul 2007 04:47:25 +0000
parents c121d94672cb
children 07f084fe97d0
comparison
equal deleted inserted replaced
2166:711bea885c1e 2167:8f2174e22cd5
19 Sam Lantinga 19 Sam Lantinga
20 slouken@libsdl.org 20 slouken@libsdl.org
21 */ 21 */
22 #include "SDL_config.h" 22 #include "SDL_config.h"
23 23
24 #if 0 /* FIXME: This code needs to be rewritten to reference the static data using relocatable addresses (e.g. http://www.gentoo.org/proj/en/hardened/pic-fix-guide.xml or http://nasm.sourceforge.net/doc/html/nasmdoc8.html#section-8.2) This code currently breaks on systems with readonly text segments (hardened Linux / Intel Mac) */ 24 /* *INDENT-OFF* */
25
26 #if 0 /* FIXME: This code needs to be rewritten to reference the static data using relocatable addresses (e.g. http://www.gentoo.org/proj/en/hardened/pic-fix-guide.xml or http://nasm.sourceforge.net/doc/html/nasmdoc8.html#section-8.2) This code currently breaks on systems with readonly text segments (hardened Linux / Intel Mac) */
25 #if defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES 27 #if defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES
26 28
27 #include "SDL_stdinc.h" 29 #include "SDL_stdinc.h"
28 30
29 #define ASM_ARRAY(x) x[] __asm__("_" #x) __attribute__((used)) 31 #define ASM_ARRAY(x) x[] __asm__("_" #x) __attribute__((used))
30 32
31 static unsigned int ASM_ARRAY(MMX_0080w) = { 33 static unsigned int ASM_ARRAY(MMX_0080w) = {0x00800080, 0x00800080};
32 0x00800080, 0x00800080}; 34 static unsigned int ASM_ARRAY(MMX_00FFw) = {0x00ff00ff, 0x00ff00ff};
33 static unsigned int ASM_ARRAY(MMX_00FFw) = { 35 static unsigned int ASM_ARRAY(MMX_FF00w) = {0xff00ff00, 0xff00ff00};
34 0x00ff00ff, 0x00ff00ff}; 36
35 static unsigned int ASM_ARRAY(MMX_FF00w) = { 37 static unsigned short ASM_ARRAY(MMX_Ycoeff) = {0x004a, 0x004a, 0x004a, 0x004a};
36 0xff00ff00, 0xff00ff00}; 38
37 39 static unsigned short ASM_ARRAY(MMX_UbluRGB) = {0x0072, 0x0072, 0x0072, 0x0072};
38 static unsigned short ASM_ARRAY(MMX_Ycoeff) = { 40 static unsigned short ASM_ARRAY(MMX_VredRGB) = {0x0059, 0x0059, 0x0059, 0x0059};
39 0x004a, 0x004a, 0x004a, 0x004a}; 41 static unsigned short ASM_ARRAY(MMX_UgrnRGB) = {0xffea, 0xffea, 0xffea, 0xffea};
40 42 static unsigned short ASM_ARRAY(MMX_VgrnRGB) = {0xffd2, 0xffd2, 0xffd2, 0xffd2};
41 static unsigned short ASM_ARRAY(MMX_UbluRGB) = { 43
42 0x0072, 0x0072, 0x0072, 0x0072}; 44 static unsigned short ASM_ARRAY(MMX_Ublu5x5) = {0x0081, 0x0081, 0x0081, 0x0081};
43 static unsigned short ASM_ARRAY(MMX_VredRGB) = { 45 static unsigned short ASM_ARRAY(MMX_Vred5x5) = {0x0066, 0x0066, 0x0066, 0x0066};
44 0x0059, 0x0059, 0x0059, 0x0059}; 46 static unsigned short ASM_ARRAY(MMX_Ugrn555) = {0xffe7, 0xffe7, 0xffe7, 0xffe7};
45 static unsigned short ASM_ARRAY(MMX_UgrnRGB) = { 47 static unsigned short ASM_ARRAY(MMX_Vgrn555) = {0xffcc, 0xffcc, 0xffcc, 0xffcc};
46 0xffea, 0xffea, 0xffea, 0xffea}; 48 static unsigned short ASM_ARRAY(MMX_Ugrn565) = {0xffe8, 0xffe8, 0xffe8, 0xffe8};
47 static unsigned short ASM_ARRAY(MMX_VgrnRGB) = { 49 static unsigned short ASM_ARRAY(MMX_Vgrn565) = {0xffcd, 0xffcd, 0xffcd, 0xffcd};
48 0xffd2, 0xffd2, 0xffd2, 0xffd2}; 50
49 51 static unsigned short ASM_ARRAY(MMX_red555) = {0x7c00, 0x7c00, 0x7c00, 0x7c00};
50 static unsigned short ASM_ARRAY(MMX_Ublu5x5) = { 52 static unsigned short ASM_ARRAY(MMX_red565) = {0xf800, 0xf800, 0xf800, 0xf800};
51 0x0081, 0x0081, 0x0081, 0x0081}; 53 static unsigned short ASM_ARRAY(MMX_grn555) = {0x03e0, 0x03e0, 0x03e0, 0x03e0};
52 static unsigned short ASM_ARRAY(MMX_Vred5x5) = { 54 static unsigned short ASM_ARRAY(MMX_grn565) = {0x07e0, 0x07e0, 0x07e0, 0x07e0};
53 0x0066, 0x0066, 0x0066, 0x0066}; 55 static unsigned short ASM_ARRAY(MMX_blu5x5) = {0x001f, 0x001f, 0x001f, 0x001f};
54 static unsigned short ASM_ARRAY(MMX_Ugrn555) = {
55 0xffe7, 0xffe7, 0xffe7, 0xffe7};
56 static unsigned short ASM_ARRAY(MMX_Vgrn555) = {
57 0xffcc, 0xffcc, 0xffcc, 0xffcc};
58 static unsigned short ASM_ARRAY(MMX_Ugrn565) = {
59 0xffe8, 0xffe8, 0xffe8, 0xffe8};
60 static unsigned short ASM_ARRAY(MMX_Vgrn565) = {
61 0xffcd, 0xffcd, 0xffcd, 0xffcd};
62
63 static unsigned short ASM_ARRAY(MMX_red555) = {
64 0x7c00, 0x7c00, 0x7c00, 0x7c00};
65 static unsigned short ASM_ARRAY(MMX_red565) = {
66 0xf800, 0xf800, 0xf800, 0xf800};
67 static unsigned short ASM_ARRAY(MMX_grn555) = {
68 0x03e0, 0x03e0, 0x03e0, 0x03e0};
69 static unsigned short ASM_ARRAY(MMX_grn565) = {
70 0x07e0, 0x07e0, 0x07e0, 0x07e0};
71 static unsigned short ASM_ARRAY(MMX_blu5x5) = {
72 0x001f, 0x001f, 0x001f, 0x001f};
73 56
74 /** 57 /**
75 This MMX assembler is my first assembler/MMX program ever. 58 This MMX assembler is my first assembler/MMX program ever.
76 Thus it may be buggy. 59 Thus it may be buggy.
77 Send patches to: 60 Send patches to:
98 It is a requirement that cr/cb/lum are 8-byte aligned and 81 It is a requirement that cr/cb/lum are 8-byte aligned and
99 that out is 16-byte aligned, or you may get segfaults 82 that out is 16-byte aligned, or you may get segfaults
100 83
101 */ 84 */
102 85
103 void 86 void ColorRGBDitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix,
104 ColorRGBDitherYV12MMX1X(int *colortab, Uint32 * rgb_2_pix, 87 unsigned char *lum, unsigned char *cr,
105 unsigned char *lum, unsigned char *cr, 88 unsigned char *cb, unsigned char *out,
106 unsigned char *cb, unsigned char *out, 89 int rows, int cols, int mod )
107 int rows, int cols, int mod)
108 { 90 {
109 Uint32 *row1; 91 Uint32 *row1;
110 Uint32 *row2; 92 Uint32 *row2;
111 93
112 unsigned char *y = lum + cols * rows; // Pointer to the end 94 unsigned char* y = lum +cols*rows; // Pointer to the end
113 int x = 0; 95 int x=0;
114 row1 = (Uint32 *) out; // 32 bit target 96 row1 = (Uint32 *)out; // 32 bit target
115 row2 = (Uint32 *) out + cols + mod; // start of second row 97 row2 = (Uint32 *)out+cols+mod; // start of second row
116 mod = (mod + cols + mod) * 4; // increment for row1 in byte 98 mod = (mod+cols+mod)*4; // increment for row1 in byte
117 99
118 __asm__ __volatile__( 100 __asm__ __volatile__ (
119 /* We don't really care about PIC - the code should be rewritten to use 101 /* We don't really care about PIC - the code should be rewritten to use
120 relative addressing for the static tables, so right now we take the 102 relative addressing for the static tables, so right now we take the
121 COW hit on the pages this code resides. Big deal. 103 COW hit on the pages this code resides. Big deal.
122 This spill is just to reduce register pressure in the PIC case. */ 104 This spill is just to reduce register pressure in the PIC case. */
123 "pushl %%ebx\n" 105 "pushl %%ebx\n"
124 "movl %0, %%ebx\n" ".align 8\n" "1:\n" 106 "movl %0, %%ebx\n"
125 // create Cr (result in mm1) 107
126 "movd (%%ebx), %%mm1\n" // 0 0 0 0 v3 v2 v1 v0 108 ".align 8\n"
127 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00 109 "1:\n"
128 "movd (%2), %%mm2\n" // 0 0 0 0 l3 l2 l1 l0 110
129 "punpcklbw %%mm7,%%mm1\n" // 0 v3 0 v2 00 v1 00 v0 111 // create Cr (result in mm1)
130 "punpckldq %%mm1,%%mm1\n" // 00 v1 00 v0 00 v1 00 v0 112 "movd (%%ebx), %%mm1\n" // 0 0 0 0 v3 v2 v1 v0
131 "psubw _MMX_0080w,%%mm1\n" // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 113 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00
132 // create Cr_g (result in mm0) 114 "movd (%2), %%mm2\n" // 0 0 0 0 l3 l2 l1 l0
133 "movq %%mm1,%%mm0\n" // r1 r1 r0 r0 r1 r1 r0 r0 115 "punpcklbw %%mm7,%%mm1\n" // 0 v3 0 v2 00 v1 00 v0
134 "pmullw _MMX_VgrnRGB,%%mm0\n" // red*-46dec=0.7136*64 116 "punpckldq %%mm1,%%mm1\n" // 00 v1 00 v0 00 v1 00 v0
135 "pmullw _MMX_VredRGB,%%mm1\n" // red*89dec=1.4013*64 117 "psubw _MMX_0080w,%%mm1\n" // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0
136 "psraw $6, %%mm0\n" // red=red/64 118
137 "psraw $6, %%mm1\n" // red=red/64 119 // create Cr_g (result in mm0)
138 // create L1 L2 (result in mm2,mm4) 120 "movq %%mm1,%%mm0\n" // r1 r1 r0 r0 r1 r1 r0 r0
139 // L2=lum+cols 121 "pmullw _MMX_VgrnRGB,%%mm0\n"// red*-46dec=0.7136*64
140 "movq (%2,%4),%%mm3\n" // 0 0 0 0 L3 L2 L1 L0 122 "pmullw _MMX_VredRGB,%%mm1\n"// red*89dec=1.4013*64
141 "punpckldq %%mm3,%%mm2\n" // L3 L2 L1 L0 l3 l2 l1 l0 123 "psraw $6, %%mm0\n" // red=red/64
142 "movq %%mm2,%%mm4\n" // L3 L2 L1 L0 l3 l2 l1 l0 124 "psraw $6, %%mm1\n" // red=red/64
143 "pand _MMX_FF00w,%%mm2\n" // L3 0 L1 0 l3 0 l1 0 125
144 "pand _MMX_00FFw,%%mm4\n" // 0 L2 0 L0 0 l2 0 l0 126 // create L1 L2 (result in mm2,mm4)
145 "psrlw $8,%%mm2\n" // 0 L3 0 L1 0 l3 0 l1 127 // L2=lum+cols
146 // create R (result in mm6) 128 "movq (%2,%4),%%mm3\n" // 0 0 0 0 L3 L2 L1 L0
147 "movq %%mm2,%%mm5\n" // 0 L3 0 L1 0 l3 0 l1 129 "punpckldq %%mm3,%%mm2\n" // L3 L2 L1 L0 l3 l2 l1 l0
148 "movq %%mm4,%%mm6\n" // 0 L2 0 L0 0 l2 0 l0 130 "movq %%mm2,%%mm4\n" // L3 L2 L1 L0 l3 l2 l1 l0
149 "paddsw %%mm1, %%mm5\n" // lum1+red:x R3 x R1 x r3 x r1 131 "pand _MMX_FF00w,%%mm2\n" // L3 0 L1 0 l3 0 l1 0
150 "paddsw %%mm1, %%mm6\n" // lum1+red:x R2 x R0 x r2 x r0 132 "pand _MMX_00FFw,%%mm4\n" // 0 L2 0 L0 0 l2 0 l0
151 "packuswb %%mm5,%%mm5\n" // R3 R1 r3 r1 R3 R1 r3 r1 133 "psrlw $8,%%mm2\n" // 0 L3 0 L1 0 l3 0 l1
152 "packuswb %%mm6,%%mm6\n" // R2 R0 r2 r0 R2 R0 r2 r0 134
153 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00 135 // create R (result in mm6)
154 "punpcklbw %%mm5,%%mm6\n" // R3 R2 R1 R0 r3 r2 r1 r0 136 "movq %%mm2,%%mm5\n" // 0 L3 0 L1 0 l3 0 l1
155 // create Cb (result in mm1) 137 "movq %%mm4,%%mm6\n" // 0 L2 0 L0 0 l2 0 l0
156 "movd (%1), %%mm1\n" // 0 0 0 0 u3 u2 u1 u0 138 "paddsw %%mm1, %%mm5\n" // lum1+red:x R3 x R1 x r3 x r1
157 "punpcklbw %%mm7,%%mm1\n" // 0 u3 0 u2 00 u1 00 u0 139 "paddsw %%mm1, %%mm6\n" // lum1+red:x R2 x R0 x r2 x r0
158 "punpckldq %%mm1,%%mm1\n" // 00 u1 00 u0 00 u1 00 u0 140 "packuswb %%mm5,%%mm5\n" // R3 R1 r3 r1 R3 R1 r3 r1
159 "psubw _MMX_0080w,%%mm1\n" // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 141 "packuswb %%mm6,%%mm6\n" // R2 R0 r2 r0 R2 R0 r2 r0
160 // create Cb_g (result in mm5) 142 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00
161 "movq %%mm1,%%mm5\n" // u1 u1 u0 u0 u1 u1 u0 u0 143 "punpcklbw %%mm5,%%mm6\n" // R3 R2 R1 R0 r3 r2 r1 r0
162 "pmullw _MMX_UgrnRGB,%%mm5\n" // blue*-109dec=1.7129*64 144
163 "pmullw _MMX_UbluRGB,%%mm1\n" // blue*114dec=1.78125*64 145 // create Cb (result in mm1)
164 "psraw $6, %%mm5\n" // blue=red/64 146 "movd (%1), %%mm1\n" // 0 0 0 0 u3 u2 u1 u0
165 "psraw $6, %%mm1\n" // blue=blue/64 147 "punpcklbw %%mm7,%%mm1\n" // 0 u3 0 u2 00 u1 00 u0
166 // create G (result in mm7) 148 "punpckldq %%mm1,%%mm1\n" // 00 u1 00 u0 00 u1 00 u0
167 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1 149 "psubw _MMX_0080w,%%mm1\n" // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0
168 "movq %%mm4,%%mm7\n" // 0 L2 0 L0 0 l2 0 l1 150 // create Cb_g (result in mm5)
169 "paddsw %%mm5, %%mm3\n" // lum1+Cb_g:x G3t x G1t x g3t x g1t 151 "movq %%mm1,%%mm5\n" // u1 u1 u0 u0 u1 u1 u0 u0
170 "paddsw %%mm5, %%mm7\n" // lum1+Cb_g:x G2t x G0t x g2t x g0t 152 "pmullw _MMX_UgrnRGB,%%mm5\n" // blue*-109dec=1.7129*64
171 "paddsw %%mm0, %%mm3\n" // lum1+Cr_g:x G3 x G1 x g3 x g1 153 "pmullw _MMX_UbluRGB,%%mm1\n" // blue*114dec=1.78125*64
172 "paddsw %%mm0, %%mm7\n" // lum1+blue:x G2 x G0 x g2 x g0 154 "psraw $6, %%mm5\n" // blue=red/64
173 "packuswb %%mm3,%%mm3\n" // G3 G1 g3 g1 G3 G1 g3 g1 155 "psraw $6, %%mm1\n" // blue=blue/64
174 "packuswb %%mm7,%%mm7\n" // G2 G0 g2 g0 G2 G0 g2 g0 156
175 "punpcklbw %%mm3,%%mm7\n" // G3 G2 G1 G0 g3 g2 g1 g0 157 // create G (result in mm7)
176 // create B (result in mm5) 158 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1
177 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1 159 "movq %%mm4,%%mm7\n" // 0 L2 0 L0 0 l2 0 l1
178 "movq %%mm4,%%mm5\n" // 0 L2 0 L0 0 l2 0 l1 160 "paddsw %%mm5, %%mm3\n" // lum1+Cb_g:x G3t x G1t x g3t x g1t
179 "paddsw %%mm1, %%mm3\n" // lum1+blue:x B3 x B1 x b3 x b1 161 "paddsw %%mm5, %%mm7\n" // lum1+Cb_g:x G2t x G0t x g2t x g0t
180 "paddsw %%mm1, %%mm5\n" // lum1+blue:x B2 x B0 x b2 x b0 162 "paddsw %%mm0, %%mm3\n" // lum1+Cr_g:x G3 x G1 x g3 x g1
181 "packuswb %%mm3,%%mm3\n" // B3 B1 b3 b1 B3 B1 b3 b1 163 "paddsw %%mm0, %%mm7\n" // lum1+blue:x G2 x G0 x g2 x g0
182 "packuswb %%mm5,%%mm5\n" // B2 B0 b2 b0 B2 B0 b2 b0 164 "packuswb %%mm3,%%mm3\n" // G3 G1 g3 g1 G3 G1 g3 g1
183 "punpcklbw %%mm3,%%mm5\n" // B3 B2 B1 B0 b3 b2 b1 b0 165 "packuswb %%mm7,%%mm7\n" // G2 G0 g2 g0 G2 G0 g2 g0
184 // fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb) 166 "punpcklbw %%mm3,%%mm7\n" // G3 G2 G1 G0 g3 g2 g1 g0
185 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0 167
186 "pxor %%mm4,%%mm4\n" // 0 0 0 0 0 0 0 0 168 // create B (result in mm5)
187 "movq %%mm6,%%mm1\n" // R3 R2 R1 R0 r3 r2 r1 r0 169 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1
188 "movq %%mm5,%%mm3\n" // B3 B2 B1 B0 b3 b2 b1 b0 170 "movq %%mm4,%%mm5\n" // 0 L2 0 L0 0 l2 0 l1
189 // process lower lum 171 "paddsw %%mm1, %%mm3\n" // lum1+blue:x B3 x B1 x b3 x b1
190 "punpcklbw %%mm4,%%mm1\n" // 0 r3 0 r2 0 r1 0 r0 172 "paddsw %%mm1, %%mm5\n" // lum1+blue:x B2 x B0 x b2 x b0
191 "punpcklbw %%mm4,%%mm3\n" // 0 b3 0 b2 0 b1 0 b0 173 "packuswb %%mm3,%%mm3\n" // B3 B1 b3 b1 B3 B1 b3 b1
192 "movq %%mm1,%%mm2\n" // 0 r3 0 r2 0 r1 0 r0 174 "packuswb %%mm5,%%mm5\n" // B2 B0 b2 b0 B2 B0 b2 b0
193 "movq %%mm3,%%mm0\n" // 0 b3 0 b2 0 b1 0 b0 175 "punpcklbw %%mm3,%%mm5\n" // B3 B2 B1 B0 b3 b2 b1 b0
194 "punpcklwd %%mm1,%%mm3\n" // 0 r1 0 b1 0 r0 0 b0 176
195 "punpckhwd %%mm2,%%mm0\n" // 0 r3 0 b3 0 r2 0 b2 177 // fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
196 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0 178
197 "movq %%mm7,%%mm1\n" // G3 G2 G1 G0 g3 g2 g1 g0 179 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0
198 "punpcklbw %%mm1,%%mm2\n" // g3 0 g2 0 g1 0 g0 0 180 "pxor %%mm4,%%mm4\n" // 0 0 0 0 0 0 0 0
199 "punpcklwd %%mm4,%%mm2\n" // 0 0 g1 0 0 0 g0 0 181 "movq %%mm6,%%mm1\n" // R3 R2 R1 R0 r3 r2 r1 r0
200 "por %%mm3, %%mm2\n" // 0 r1 g1 b1 0 r0 g0 b0 182 "movq %%mm5,%%mm3\n" // B3 B2 B1 B0 b3 b2 b1 b0
201 "movq %%mm2,(%3)\n" // wrote out ! row1 183 // process lower lum
202 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0 184 "punpcklbw %%mm4,%%mm1\n" // 0 r3 0 r2 0 r1 0 r0
203 "punpcklbw %%mm1,%%mm4\n" // g3 0 g2 0 g1 0 g0 0 185 "punpcklbw %%mm4,%%mm3\n" // 0 b3 0 b2 0 b1 0 b0
204 "punpckhwd %%mm2,%%mm4\n" // 0 0 g3 0 0 0 g2 0 186 "movq %%mm1,%%mm2\n" // 0 r3 0 r2 0 r1 0 r0
205 "por %%mm0, %%mm4\n" // 0 r3 g3 b3 0 r2 g2 b2 187 "movq %%mm3,%%mm0\n" // 0 b3 0 b2 0 b1 0 b0
206 "movq %%mm4,8(%3)\n" // wrote out ! row1 188 "punpcklwd %%mm1,%%mm3\n" // 0 r1 0 b1 0 r0 0 b0
207 // fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb) 189 "punpckhwd %%mm2,%%mm0\n" // 0 r3 0 b3 0 r2 0 b2
208 // this can be done "destructive" 190
209 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0 191 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0
210 "punpckhbw %%mm2,%%mm6\n" // 0 R3 0 R2 0 R1 0 R0 192 "movq %%mm7,%%mm1\n" // G3 G2 G1 G0 g3 g2 g1 g0
211 "punpckhbw %%mm1,%%mm5\n" // G3 B3 G2 B2 G1 B1 G0 B0 193 "punpcklbw %%mm1,%%mm2\n" // g3 0 g2 0 g1 0 g0 0
212 "movq %%mm5,%%mm1\n" // G3 B3 G2 B2 G1 B1 G0 B0 194 "punpcklwd %%mm4,%%mm2\n" // 0 0 g1 0 0 0 g0 0
213 "punpcklwd %%mm6,%%mm1\n" // 0 R1 G1 B1 0 R0 G0 B0 195 "por %%mm3, %%mm2\n" // 0 r1 g1 b1 0 r0 g0 b0
214 "movq %%mm1,(%5)\n" // wrote out ! row2 196 "movq %%mm2,(%3)\n" // wrote out ! row1
215 "punpckhwd %%mm6,%%mm5\n" // 0 R3 G3 B3 0 R2 G2 B2 197
216 "movq %%mm5,8(%5)\n" // wrote out ! row2 198 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0
217 "addl $4,%2\n" // lum+4 199 "punpcklbw %%mm1,%%mm4\n" // g3 0 g2 0 g1 0 g0 0
218 "leal 16(%3),%3\n" // row1+16 200 "punpckhwd %%mm2,%%mm4\n" // 0 0 g3 0 0 0 g2 0
219 "leal 16(%5),%5\n" // row2+16 201 "por %%mm0, %%mm4\n" // 0 r3 g3 b3 0 r2 g2 b2
220 "addl $2, %%ebx\n" // cr+2 202 "movq %%mm4,8(%3)\n" // wrote out ! row1
221 "addl $2, %1\n" // cb+2 203
222 "addl $4,%6\n" // x+4 204 // fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
223 "cmpl %4,%6\n" "jl 1b\n" "addl %4, %2\n" // lum += cols 205 // this can be done "destructive"
224 "addl %8, %3\n" // row1+= mod 206 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0
225 "addl %8, %5\n" // row2+= mod 207 "punpckhbw %%mm2,%%mm6\n" // 0 R3 0 R2 0 R1 0 R0
226 "movl $0, %6\n" // x=0 208 "punpckhbw %%mm1,%%mm5\n" // G3 B3 G2 B2 G1 B1 G0 B0
227 "cmpl %7, %2\n" 209 "movq %%mm5,%%mm1\n" // G3 B3 G2 B2 G1 B1 G0 B0
228 "jl 1b\n" 210 "punpcklwd %%mm6,%%mm1\n" // 0 R1 G1 B1 0 R0 G0 B0
229 "emms\n" 211 "movq %%mm1,(%5)\n" // wrote out ! row2
230 "popl %%ebx\n"::"m"(cr), "r"(cb), "r"(lum), 212 "punpckhwd %%mm6,%%mm5\n" // 0 R3 G3 B3 0 R2 G2 B2
231 "r"(row1), "r"(cols), "r"(row2), "m"(x), 213 "movq %%mm5,8(%5)\n" // wrote out ! row2
232 "m"(y), "m"(mod)); 214
215 "addl $4,%2\n" // lum+4
216 "leal 16(%3),%3\n" // row1+16
217 "leal 16(%5),%5\n" // row2+16
218 "addl $2, %%ebx\n" // cr+2
219 "addl $2, %1\n" // cb+2
220
221 "addl $4,%6\n" // x+4
222 "cmpl %4,%6\n"
223
224 "jl 1b\n"
225 "addl %4, %2\n" // lum += cols
226 "addl %8, %3\n" // row1+= mod
227 "addl %8, %5\n" // row2+= mod
228 "movl $0, %6\n" // x=0
229 "cmpl %7, %2\n"
230 "jl 1b\n"
231 "emms\n"
232 "popl %%ebx\n"
233 :
234 : "m" (cr), "r"(cb),"r"(lum),
235 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod));
233 } 236 }
234 237
235 void 238 void Color565DitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix,
236 Color565DitherYV12MMX1X(int *colortab, Uint32 * rgb_2_pix, 239 unsigned char *lum, unsigned char *cr,
237 unsigned char *lum, unsigned char *cr, 240 unsigned char *cb, unsigned char *out,
238 unsigned char *cb, unsigned char *out, 241 int rows, int cols, int mod )
239 int rows, int cols, int mod)
240 { 242 {
241 Uint16 *row1; 243 Uint16 *row1;
242 Uint16 *row2; 244 Uint16 *row2;
243 245
244 unsigned char *y = lum + cols * rows; /* Pointer to the end */ 246 unsigned char* y = lum +cols*rows; /* Pointer to the end */
245 int x = 0; 247 int x=0;
246 row1 = (Uint16 *) out; /* 16 bit target */ 248 row1 = (Uint16 *)out; /* 16 bit target */
247 row2 = (Uint16 *) out + cols + mod; /* start of second row */ 249 row2 = (Uint16 *)out+cols+mod; /* start of second row */
248 mod = (mod + cols + mod) * 2; /* increment for row1 in byte */ 250 mod = (mod+cols+mod)*2; /* increment for row1 in byte */
249 251
250 252
251 __asm__ __volatile__("pushl %%ebx\n" "movl %0, %%ebx\n" ".align 8\n" "1:\n" "movd (%1), %%mm0\n" // 4 Cb 0 0 0 0 u3 u2 u1 u0 253 __asm__ __volatile__(
252 "pxor %%mm7, %%mm7\n" "movd (%%ebx), %%mm1\n" // 4 Cr 0 0 0 0 v3 v2 v1 v0 254 "pushl %%ebx\n"
253 "punpcklbw %%mm7, %%mm0\n" // 4 W cb 0 u3 0 u2 0 u1 0 u0 255 "movl %0, %%ebx\n"
254 "punpcklbw %%mm7, %%mm1\n" // 4 W cr 0 v3 0 v2 0 v1 0 v0 256
255 "psubw _MMX_0080w, %%mm0\n" "psubw _MMX_0080w, %%mm1\n" "movq %%mm0, %%mm2\n" // Cb 0 u3 0 u2 0 u1 0 u0 257 ".align 8\n"
256 "movq %%mm1, %%mm3\n" // Cr 258 "1:\n"
257 "pmullw _MMX_Ugrn565, %%mm2\n" // Cb2green 0 R3 0 R2 0 R1 0 R0 259 "movd (%1), %%mm0\n" // 4 Cb 0 0 0 0 u3 u2 u1 u0
258 "movq (%2), %%mm6\n" // L1 l7 L6 L5 L4 L3 L2 L1 L0 260 "pxor %%mm7, %%mm7\n"
259 "pmullw _MMX_Ublu5x5, %%mm0\n" // Cb2blue 261 "movd (%%ebx), %%mm1\n" // 4 Cr 0 0 0 0 v3 v2 v1 v0
260 "pand _MMX_00FFw, %%mm6\n" // L1 00 L6 00 L4 00 L2 00 L0 262 "punpcklbw %%mm7, %%mm0\n" // 4 W cb 0 u3 0 u2 0 u1 0 u0
261 "pmullw _MMX_Vgrn565, %%mm3\n" // Cr2green 263 "punpcklbw %%mm7, %%mm1\n" // 4 W cr 0 v3 0 v2 0 v1 0 v0
262 "movq (%2), %%mm7\n" // L2 264 "psubw _MMX_0080w, %%mm0\n"
263 "pmullw _MMX_Vred5x5, %%mm1\n" // Cr2red 265 "psubw _MMX_0080w, %%mm1\n"
264 "psrlw $8, %%mm7\n" // L2 00 L7 00 L5 00 L3 00 L1 266 "movq %%mm0, %%mm2\n" // Cb 0 u3 0 u2 0 u1 0 u0
265 "pmullw _MMX_Ycoeff, %%mm6\n" // lum1 267 "movq %%mm1, %%mm3\n" // Cr
266 "paddw %%mm3, %%mm2\n" // Cb2green + Cr2green == green 268 "pmullw _MMX_Ugrn565, %%mm2\n" // Cb2green 0 R3 0 R2 0 R1 0 R0
267 "pmullw _MMX_Ycoeff, %%mm7\n" // lum2 269 "movq (%2), %%mm6\n" // L1 l7 L6 L5 L4 L3 L2 L1 L0
268 "movq %%mm6, %%mm4\n" // lum1 270 "pmullw _MMX_Ublu5x5, %%mm0\n" // Cb2blue
269 "paddw %%mm0, %%mm6\n" // lum1 +blue 00 B6 00 B4 00 B2 00 B0 271 "pand _MMX_00FFw, %%mm6\n" // L1 00 L6 00 L4 00 L2 00 L0
270 "movq %%mm4, %%mm5\n" // lum1 272 "pmullw _MMX_Vgrn565, %%mm3\n" // Cr2green
271 "paddw %%mm1, %%mm4\n" // lum1 +red 00 R6 00 R4 00 R2 00 R0 273 "movq (%2), %%mm7\n" // L2
272 "paddw %%mm2, %%mm5\n" // lum1 +green 00 G6 00 G4 00 G2 00 G0 274 "pmullw _MMX_Vred5x5, %%mm1\n" // Cr2red
273 "psraw $6, %%mm4\n" // R1 0 .. 64 275 "psrlw $8, %%mm7\n" // L2 00 L7 00 L5 00 L3 00 L1
274 "movq %%mm7, %%mm3\n" // lum2 00 L7 00 L5 00 L3 00 L1 276 "pmullw _MMX_Ycoeff, %%mm6\n" // lum1
275 "psraw $6, %%mm5\n" // G1 - .. + 277 "paddw %%mm3, %%mm2\n" // Cb2green + Cr2green == green
276 "paddw %%mm0, %%mm7\n" // Lum2 +blue 00 B7 00 B5 00 B3 00 B1 278 "pmullw _MMX_Ycoeff, %%mm7\n" // lum2
277 "psraw $6, %%mm6\n" // B1 0 .. 64 279
278 "packuswb %%mm4, %%mm4\n" // R1 R1 280 "movq %%mm6, %%mm4\n" // lum1
279 "packuswb %%mm5, %%mm5\n" // G1 G1 281 "paddw %%mm0, %%mm6\n" // lum1 +blue 00 B6 00 B4 00 B2 00 B0
280 "packuswb %%mm6, %%mm6\n" // B1 B1 282 "movq %%mm4, %%mm5\n" // lum1
281 "punpcklbw %%mm4, %%mm4\n" "punpcklbw %%mm5, %%mm5\n" "pand _MMX_red565, %%mm4\n" "psllw $3, %%mm5\n" // GREEN 1 283 "paddw %%mm1, %%mm4\n" // lum1 +red 00 R6 00 R4 00 R2 00 R0
282 "punpcklbw %%mm6, %%mm6\n" "pand _MMX_grn565, %%mm5\n" "pand _MMX_red565, %%mm6\n" "por %%mm5, %%mm4\n" // 284 "paddw %%mm2, %%mm5\n" // lum1 +green 00 G6 00 G4 00 G2 00 G0
283 "psrlw $11, %%mm6\n" // BLUE 1 285 "psraw $6, %%mm4\n" // R1 0 .. 64
284 "movq %%mm3, %%mm5\n" // lum2 286 "movq %%mm7, %%mm3\n" // lum2 00 L7 00 L5 00 L3 00 L1
285 "paddw %%mm1, %%mm3\n" // lum2 +red 00 R7 00 R5 00 R3 00 R1 287 "psraw $6, %%mm5\n" // G1 - .. +
286 "paddw %%mm2, %%mm5\n" // lum2 +green 00 G7 00 G5 00 G3 00 G1 288 "paddw %%mm0, %%mm7\n" // Lum2 +blue 00 B7 00 B5 00 B3 00 B1
287 "psraw $6, %%mm3\n" // R2 289 "psraw $6, %%mm6\n" // B1 0 .. 64
288 "por %%mm6, %%mm4\n" // MM4 290 "packuswb %%mm4, %%mm4\n" // R1 R1
289 "psraw $6, %%mm5\n" // G2 291 "packuswb %%mm5, %%mm5\n" // G1 G1
290 "movq (%2, %4), %%mm6\n" // L3 load lum2 292 "packuswb %%mm6, %%mm6\n" // B1 B1
291 "psraw $6, %%mm7\n" "packuswb %%mm3, %%mm3\n" "packuswb %%mm5, %%mm5\n" "packuswb %%mm7, %%mm7\n" "pand _MMX_00FFw, %%mm6\n" // L3 293 "punpcklbw %%mm4, %%mm4\n"
292 "punpcklbw %%mm3, %%mm3\n" "punpcklbw %%mm5, %%mm5\n" "pmullw _MMX_Ycoeff, %%mm6\n" // lum3 294 "punpcklbw %%mm5, %%mm5\n"
293 "punpcklbw %%mm7, %%mm7\n" "psllw $3, %%mm5\n" // GREEN 2 295
294 "pand _MMX_red565, %%mm7\n" "pand _MMX_red565, %%mm3\n" "psrlw $11, %%mm7\n" // BLUE 2 296 "pand _MMX_red565, %%mm4\n"
295 "pand _MMX_grn565, %%mm5\n" "por %%mm7, %%mm3\n" "movq (%2,%4), %%mm7\n" // L4 load lum2 297 "psllw $3, %%mm5\n" // GREEN 1
296 "por %%mm5, %%mm3\n" // 298 "punpcklbw %%mm6, %%mm6\n"
297 "psrlw $8, %%mm7\n" // L4 299 "pand _MMX_grn565, %%mm5\n"
298 "movq %%mm4, %%mm5\n" "punpcklwd %%mm3, %%mm4\n" "pmullw _MMX_Ycoeff, %%mm7\n" // lum4 300 "pand _MMX_red565, %%mm6\n"
299 "punpckhwd %%mm3, %%mm5\n" "movq %%mm4, (%3)\n" // write row1 301 "por %%mm5, %%mm4\n" //
300 "movq %%mm5, 8(%3)\n" // write row1 302 "psrlw $11, %%mm6\n" // BLUE 1
301 "movq %%mm6, %%mm4\n" // Lum3 303 "movq %%mm3, %%mm5\n" // lum2
302 "paddw %%mm0, %%mm6\n" // Lum3 +blue 304 "paddw %%mm1, %%mm3\n" // lum2 +red 00 R7 00 R5 00 R3 00 R1
303 "movq %%mm4, %%mm5\n" // Lum3 305 "paddw %%mm2, %%mm5\n" // lum2 +green 00 G7 00 G5 00 G3 00 G1
304 "paddw %%mm1, %%mm4\n" // Lum3 +red 306 "psraw $6, %%mm3\n" // R2
305 "paddw %%mm2, %%mm5\n" // Lum3 +green 307 "por %%mm6, %%mm4\n" // MM4
306 "psraw $6, %%mm4\n" "movq %%mm7, %%mm3\n" // Lum4 308 "psraw $6, %%mm5\n" // G2
307 "psraw $6, %%mm5\n" "paddw %%mm0, %%mm7\n" // Lum4 +blue 309 "movq (%2, %4), %%mm6\n" // L3 load lum2
308 "psraw $6, %%mm6\n" // Lum3 +blue 310 "psraw $6, %%mm7\n"
309 "movq %%mm3, %%mm0\n" // Lum4 311 "packuswb %%mm3, %%mm3\n"
310 "packuswb %%mm4, %%mm4\n" "paddw %%mm1, %%mm3\n" // Lum4 +red 312 "packuswb %%mm5, %%mm5\n"
311 "packuswb %%mm5, %%mm5\n" "paddw %%mm2, %%mm0\n" // Lum4 +green 313 "packuswb %%mm7, %%mm7\n"
312 "packuswb %%mm6, %%mm6\n" "punpcklbw %%mm4, %%mm4\n" "punpcklbw %%mm5, %%mm5\n" "punpcklbw %%mm6, %%mm6\n" "psllw $3, %%mm5\n" // GREEN 3 314 "pand _MMX_00FFw, %%mm6\n" // L3
313 "pand _MMX_red565, %%mm4\n" "psraw $6, %%mm3\n" // psr 6 315 "punpcklbw %%mm3, %%mm3\n"
314 "psraw $6, %%mm0\n" "pand _MMX_red565, %%mm6\n" // BLUE 316 "punpcklbw %%mm5, %%mm5\n"
315 "pand _MMX_grn565, %%mm5\n" "psrlw $11, %%mm6\n" // BLUE 3 317 "pmullw _MMX_Ycoeff, %%mm6\n" // lum3
316 "por %%mm5, %%mm4\n" "psraw $6, %%mm7\n" "por %%mm6, %%mm4\n" "packuswb %%mm3, %%mm3\n" "packuswb %%mm0, %%mm0\n" "packuswb %%mm7, %%mm7\n" "punpcklbw %%mm3, %%mm3\n" "punpcklbw %%mm0, %%mm0\n" "punpcklbw %%mm7, %%mm7\n" "pand _MMX_red565, %%mm3\n" "pand _MMX_red565, %%mm7\n" // BLUE 318 "punpcklbw %%mm7, %%mm7\n"
317 "psllw $3, %%mm0\n" // GREEN 4 319 "psllw $3, %%mm5\n" // GREEN 2
318 "psrlw $11, %%mm7\n" "pand _MMX_grn565, %%mm0\n" "por %%mm7, %%mm3\n" "por %%mm0, %%mm3\n" "movq %%mm4, %%mm5\n" "punpcklwd %%mm3, %%mm4\n" "punpckhwd %%mm3, %%mm5\n" "movq %%mm4, (%5)\n" "movq %%mm5, 8(%5)\n" "addl $8, %6\n" "addl $8, %2\n" "addl $4, %%ebx\n" "addl $4, %1\n" "cmpl %4, %6\n" "leal 16(%3), %3\n" "leal 16(%5),%5\n" // row2+16 320 "pand _MMX_red565, %%mm7\n"
319 "jl 1b\n" "addl %4, %2\n" // lum += cols 321 "pand _MMX_red565, %%mm3\n"
320 "addl %8, %3\n" // row1+= mod 322 "psrlw $11, %%mm7\n" // BLUE 2
321 "addl %8, %5\n" // row2+= mod 323 "pand _MMX_grn565, %%mm5\n"
322 "movl $0, %6\n" // x=0 324 "por %%mm7, %%mm3\n"
323 "cmpl %7, %2\n" 325 "movq (%2,%4), %%mm7\n" // L4 load lum2
324 "jl 1b\n" 326 "por %%mm5, %%mm3\n" //
325 "emms\n" 327 "psrlw $8, %%mm7\n" // L4
326 "popl %%ebx\n"::"m"(cr), "r"(cb), "r"(lum), 328 "movq %%mm4, %%mm5\n"
327 "r"(row1), "r"(cols), "r"(row2), "m"(x), 329 "punpcklwd %%mm3, %%mm4\n"
328 "m"(y), "m"(mod)); 330 "pmullw _MMX_Ycoeff, %%mm7\n" // lum4
331 "punpckhwd %%mm3, %%mm5\n"
332
333 "movq %%mm4, (%3)\n" // write row1
334 "movq %%mm5, 8(%3)\n" // write row1
335
336 "movq %%mm6, %%mm4\n" // Lum3
337 "paddw %%mm0, %%mm6\n" // Lum3 +blue
338
339 "movq %%mm4, %%mm5\n" // Lum3
340 "paddw %%mm1, %%mm4\n" // Lum3 +red
341 "paddw %%mm2, %%mm5\n" // Lum3 +green
342 "psraw $6, %%mm4\n"
343 "movq %%mm7, %%mm3\n" // Lum4
344 "psraw $6, %%mm5\n"
345 "paddw %%mm0, %%mm7\n" // Lum4 +blue
346 "psraw $6, %%mm6\n" // Lum3 +blue
347 "movq %%mm3, %%mm0\n" // Lum4
348 "packuswb %%mm4, %%mm4\n"
349 "paddw %%mm1, %%mm3\n" // Lum4 +red
350 "packuswb %%mm5, %%mm5\n"
351 "paddw %%mm2, %%mm0\n" // Lum4 +green
352 "packuswb %%mm6, %%mm6\n"
353 "punpcklbw %%mm4, %%mm4\n"
354 "punpcklbw %%mm5, %%mm5\n"
355 "punpcklbw %%mm6, %%mm6\n"
356 "psllw $3, %%mm5\n" // GREEN 3
357 "pand _MMX_red565, %%mm4\n"
358 "psraw $6, %%mm3\n" // psr 6
359 "psraw $6, %%mm0\n"
360 "pand _MMX_red565, %%mm6\n" // BLUE
361 "pand _MMX_grn565, %%mm5\n"
362 "psrlw $11, %%mm6\n" // BLUE 3
363 "por %%mm5, %%mm4\n"
364 "psraw $6, %%mm7\n"
365 "por %%mm6, %%mm4\n"
366 "packuswb %%mm3, %%mm3\n"
367 "packuswb %%mm0, %%mm0\n"
368 "packuswb %%mm7, %%mm7\n"
369 "punpcklbw %%mm3, %%mm3\n"
370 "punpcklbw %%mm0, %%mm0\n"
371 "punpcklbw %%mm7, %%mm7\n"
372 "pand _MMX_red565, %%mm3\n"
373 "pand _MMX_red565, %%mm7\n" // BLUE
374 "psllw $3, %%mm0\n" // GREEN 4
375 "psrlw $11, %%mm7\n"
376 "pand _MMX_grn565, %%mm0\n"
377 "por %%mm7, %%mm3\n"
378 "por %%mm0, %%mm3\n"
379
380 "movq %%mm4, %%mm5\n"
381
382 "punpcklwd %%mm3, %%mm4\n"
383 "punpckhwd %%mm3, %%mm5\n"
384
385 "movq %%mm4, (%5)\n"
386 "movq %%mm5, 8(%5)\n"
387
388 "addl $8, %6\n"
389 "addl $8, %2\n"
390 "addl $4, %%ebx\n"
391 "addl $4, %1\n"
392 "cmpl %4, %6\n"
393 "leal 16(%3), %3\n"
394 "leal 16(%5),%5\n" // row2+16
395
396
397 "jl 1b\n"
398 "addl %4, %2\n" // lum += cols
399 "addl %8, %3\n" // row1+= mod
400 "addl %8, %5\n" // row2+= mod
401 "movl $0, %6\n" // x=0
402 "cmpl %7, %2\n"
403 "jl 1b\n"
404 "emms\n"
405 "popl %%ebx\n"
406 :
407 :"m" (cr), "r"(cb),"r"(lum),
408 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod));
329 } 409 }
330 410
331 #endif /* GCC i386 inline assembly */ 411 #endif /* GCC i386 inline assembly */
332 #endif /* 0 */ 412 #endif /* 0 */
413
414 /* *INDENT-ON* */
415
333 /* vi: set ts=4 sw=4 expandtab: */ 416 /* vi: set ts=4 sw=4 expandtab: */