comparison src/video/SDL_yuv_mmx.c @ 1668:4da1ee79c9af SDL-1.3

more tweaking indent options
author Sam Lantinga <slouken@libsdl.org>
date Mon, 29 May 2006 04:04:35 +0000
parents 782fd950bd46
children
comparison
equal deleted inserted replaced
1667:1fddae038bc8 1668:4da1ee79c9af
26 26
27 #include "SDL_stdinc.h" 27 #include "SDL_stdinc.h"
28 28
29 #define ASM_ARRAY(x) x[] __asm__("_" #x) __attribute__((used)) 29 #define ASM_ARRAY(x) x[] __asm__("_" #x) __attribute__((used))
30 30
31 static unsigned int ASM_ARRAY (MMX_0080w) = { 31 static unsigned int ASM_ARRAY(MMX_0080w) = {
32 0x00800080, 0x00800080}; 32 0x00800080, 0x00800080};
33 static unsigned int ASM_ARRAY (MMX_00FFw) = { 33 static unsigned int ASM_ARRAY(MMX_00FFw) = {
34 0x00ff00ff, 0x00ff00ff}; 34 0x00ff00ff, 0x00ff00ff};
35 static unsigned int ASM_ARRAY (MMX_FF00w) = { 35 static unsigned int ASM_ARRAY(MMX_FF00w) = {
36 0xff00ff00, 0xff00ff00}; 36 0xff00ff00, 0xff00ff00};
37 37
38 static unsigned short ASM_ARRAY (MMX_Ycoeff) = { 38 static unsigned short ASM_ARRAY(MMX_Ycoeff) = {
39 0x004a, 0x004a, 0x004a, 0x004a}; 39 0x004a, 0x004a, 0x004a, 0x004a};
40 40
41 static unsigned short ASM_ARRAY (MMX_UbluRGB) = { 41 static unsigned short ASM_ARRAY(MMX_UbluRGB) = {
42 0x0072, 0x0072, 0x0072, 0x0072}; 42 0x0072, 0x0072, 0x0072, 0x0072};
43 static unsigned short ASM_ARRAY (MMX_VredRGB) = { 43 static unsigned short ASM_ARRAY(MMX_VredRGB) = {
44 0x0059, 0x0059, 0x0059, 0x0059}; 44 0x0059, 0x0059, 0x0059, 0x0059};
45 static unsigned short ASM_ARRAY (MMX_UgrnRGB) = { 45 static unsigned short ASM_ARRAY(MMX_UgrnRGB) = {
46 0xffea, 0xffea, 0xffea, 0xffea}; 46 0xffea, 0xffea, 0xffea, 0xffea};
47 static unsigned short ASM_ARRAY (MMX_VgrnRGB) = { 47 static unsigned short ASM_ARRAY(MMX_VgrnRGB) = {
48 0xffd2, 0xffd2, 0xffd2, 0xffd2}; 48 0xffd2, 0xffd2, 0xffd2, 0xffd2};
49 49
50 static unsigned short ASM_ARRAY (MMX_Ublu5x5) = { 50 static unsigned short ASM_ARRAY(MMX_Ublu5x5) = {
51 0x0081, 0x0081, 0x0081, 0x0081}; 51 0x0081, 0x0081, 0x0081, 0x0081};
52 static unsigned short ASM_ARRAY (MMX_Vred5x5) = { 52 static unsigned short ASM_ARRAY(MMX_Vred5x5) = {
53 0x0066, 0x0066, 0x0066, 0x0066}; 53 0x0066, 0x0066, 0x0066, 0x0066};
54 static unsigned short ASM_ARRAY (MMX_Ugrn555) = { 54 static unsigned short ASM_ARRAY(MMX_Ugrn555) = {
55 0xffe7, 0xffe7, 0xffe7, 0xffe7}; 55 0xffe7, 0xffe7, 0xffe7, 0xffe7};
56 static unsigned short ASM_ARRAY (MMX_Vgrn555) = { 56 static unsigned short ASM_ARRAY(MMX_Vgrn555) = {
57 0xffcc, 0xffcc, 0xffcc, 0xffcc}; 57 0xffcc, 0xffcc, 0xffcc, 0xffcc};
58 static unsigned short ASM_ARRAY (MMX_Ugrn565) = { 58 static unsigned short ASM_ARRAY(MMX_Ugrn565) = {
59 0xffe8, 0xffe8, 0xffe8, 0xffe8}; 59 0xffe8, 0xffe8, 0xffe8, 0xffe8};
60 static unsigned short ASM_ARRAY (MMX_Vgrn565) = { 60 static unsigned short ASM_ARRAY(MMX_Vgrn565) = {
61 0xffcd, 0xffcd, 0xffcd, 0xffcd}; 61 0xffcd, 0xffcd, 0xffcd, 0xffcd};
62 62
63 static unsigned short ASM_ARRAY (MMX_red555) = { 63 static unsigned short ASM_ARRAY(MMX_red555) = {
64 0x7c00, 0x7c00, 0x7c00, 0x7c00}; 64 0x7c00, 0x7c00, 0x7c00, 0x7c00};
65 static unsigned short ASM_ARRAY (MMX_red565) = { 65 static unsigned short ASM_ARRAY(MMX_red565) = {
66 0xf800, 0xf800, 0xf800, 0xf800}; 66 0xf800, 0xf800, 0xf800, 0xf800};
67 static unsigned short ASM_ARRAY (MMX_grn555) = { 67 static unsigned short ASM_ARRAY(MMX_grn555) = {
68 0x03e0, 0x03e0, 0x03e0, 0x03e0}; 68 0x03e0, 0x03e0, 0x03e0, 0x03e0};
69 static unsigned short ASM_ARRAY (MMX_grn565) = { 69 static unsigned short ASM_ARRAY(MMX_grn565) = {
70 0x07e0, 0x07e0, 0x07e0, 0x07e0}; 70 0x07e0, 0x07e0, 0x07e0, 0x07e0};
71 static unsigned short ASM_ARRAY (MMX_blu5x5) = { 71 static unsigned short ASM_ARRAY(MMX_blu5x5) = {
72 0x001f, 0x001f, 0x001f, 0x001f}; 72 0x001f, 0x001f, 0x001f, 0x001f};
73 73
74 /** 74 /**
75 This MMX assembler is my first assembler/MMX program ever. 75 This MMX assembler is my first assembler/MMX program ever.
76 Thus it may be buggy. 76 Thus it may be buggy.
99 the out are 16byte aligned or you will/may get segfaults 99 the out are 16byte aligned or you will/may get segfaults
100 100
101 */ 101 */
102 102
103 void 103 void
104 ColorRGBDitherYV12MMX1X (int *colortab, Uint32 * rgb_2_pix, 104 ColorRGBDitherYV12MMX1X(int *colortab, Uint32 * rgb_2_pix,
105 unsigned char *lum, unsigned char *cr, 105 unsigned char *lum, unsigned char *cr,
106 unsigned char *cb, unsigned char *out, 106 unsigned char *cb, unsigned char *out,
107 int rows, int cols, int mod) 107 int rows, int cols, int mod)
108 { 108 {
109 Uint32 *row1; 109 Uint32 *row1;
110 Uint32 *row2; 110 Uint32 *row2;
111 111
112 unsigned char *y = lum + cols * rows; // Pointer to the end 112 unsigned char *y = lum + cols * rows; // Pointer to the end
113 int x = 0; 113 int x = 0;
114 row1 = (Uint32 *) out; // 32 bit target 114 row1 = (Uint32 *) out; // 32 bit target
115 row2 = (Uint32 *) out + cols + mod; // start of second row 115 row2 = (Uint32 *) out + cols + mod; // start of second row
116 mod = (mod + cols + mod) * 4; // increment for row1 in byte 116 mod = (mod + cols + mod) * 4; // increment for row1 in byte
117 117
118 __asm__ __volatile__ ( 118 __asm__ __volatile__(
119 /* We don't really care about PIC - the code should be rewritten to use 119 /* We don't really care about PIC - the code should be rewritten to use
120 relative addressing for the static tables, so right now we take the 120 relative addressing for the static tables, so right now we take the
121 COW hit on the pages this code resides. Big deal. 121 COW hit on the pages this code resides. Big deal.
122 This spill is just to reduce register pressure in the PIC case. */ 122 This spill is just to reduce register pressure in the PIC case. */
123 "pushl %%ebx\n" 123 "pushl %%ebx\n"
124 "movl %0, %%ebx\n" ".align 8\n" "1:\n" 124 "movl %0, %%ebx\n" ".align 8\n" "1:\n"
125 // create Cr (result in mm1) 125 // create Cr (result in mm1)
126 "movd (%%ebx), %%mm1\n" // 0 0 0 0 v3 v2 v1 v0 126 "movd (%%ebx), %%mm1\n" // 0 0 0 0 v3 v2 v1 v0
127 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00 127 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00
128 "movd (%2), %%mm2\n" // 0 0 0 0 l3 l2 l1 l0 128 "movd (%2), %%mm2\n" // 0 0 0 0 l3 l2 l1 l0
129 "punpcklbw %%mm7,%%mm1\n" // 0 v3 0 v2 00 v1 00 v0 129 "punpcklbw %%mm7,%%mm1\n" // 0 v3 0 v2 00 v1 00 v0
130 "punpckldq %%mm1,%%mm1\n" // 00 v1 00 v0 00 v1 00 v0 130 "punpckldq %%mm1,%%mm1\n" // 00 v1 00 v0 00 v1 00 v0
131 "psubw _MMX_0080w,%%mm1\n" // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 131 "psubw _MMX_0080w,%%mm1\n" // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0
132 // create Cr_g (result in mm0) 132 // create Cr_g (result in mm0)
133 "movq %%mm1,%%mm0\n" // r1 r1 r0 r0 r1 r1 r0 r0 133 "movq %%mm1,%%mm0\n" // r1 r1 r0 r0 r1 r1 r0 r0
134 "pmullw _MMX_VgrnRGB,%%mm0\n" // red*-46dec=0.7136*64 134 "pmullw _MMX_VgrnRGB,%%mm0\n" // red*-46dec=0.7136*64
135 "pmullw _MMX_VredRGB,%%mm1\n" // red*89dec=1.4013*64 135 "pmullw _MMX_VredRGB,%%mm1\n" // red*89dec=1.4013*64
136 "psraw $6, %%mm0\n" // red=red/64 136 "psraw $6, %%mm0\n" // red=red/64
137 "psraw $6, %%mm1\n" // red=red/64 137 "psraw $6, %%mm1\n" // red=red/64
138 // create L1 L2 (result in mm2,mm4) 138 // create L1 L2 (result in mm2,mm4)
139 // L2=lum+cols 139 // L2=lum+cols
140 "movq (%2,%4),%%mm3\n" // 0 0 0 0 L3 L2 L1 L0 140 "movq (%2,%4),%%mm3\n" // 0 0 0 0 L3 L2 L1 L0
141 "punpckldq %%mm3,%%mm2\n" // L3 L2 L1 L0 l3 l2 l1 l0 141 "punpckldq %%mm3,%%mm2\n" // L3 L2 L1 L0 l3 l2 l1 l0
142 "movq %%mm2,%%mm4\n" // L3 L2 L1 L0 l3 l2 l1 l0 142 "movq %%mm2,%%mm4\n" // L3 L2 L1 L0 l3 l2 l1 l0
143 "pand _MMX_FF00w,%%mm2\n" // L3 0 L1 0 l3 0 l1 0 143 "pand _MMX_FF00w,%%mm2\n" // L3 0 L1 0 l3 0 l1 0
144 "pand _MMX_00FFw,%%mm4\n" // 0 L2 0 L0 0 l2 0 l0 144 "pand _MMX_00FFw,%%mm4\n" // 0 L2 0 L0 0 l2 0 l0
145 "psrlw $8,%%mm2\n" // 0 L3 0 L1 0 l3 0 l1 145 "psrlw $8,%%mm2\n" // 0 L3 0 L1 0 l3 0 l1
146 // create R (result in mm6) 146 // create R (result in mm6)
147 "movq %%mm2,%%mm5\n" // 0 L3 0 L1 0 l3 0 l1 147 "movq %%mm2,%%mm5\n" // 0 L3 0 L1 0 l3 0 l1
148 "movq %%mm4,%%mm6\n" // 0 L2 0 L0 0 l2 0 l0 148 "movq %%mm4,%%mm6\n" // 0 L2 0 L0 0 l2 0 l0
149 "paddsw %%mm1, %%mm5\n" // lum1+red:x R3 x R1 x r3 x r1 149 "paddsw %%mm1, %%mm5\n" // lum1+red:x R3 x R1 x r3 x r1
150 "paddsw %%mm1, %%mm6\n" // lum1+red:x R2 x R0 x r2 x r0 150 "paddsw %%mm1, %%mm6\n" // lum1+red:x R2 x R0 x r2 x r0
151 "packuswb %%mm5,%%mm5\n" // R3 R1 r3 r1 R3 R1 r3 r1 151 "packuswb %%mm5,%%mm5\n" // R3 R1 r3 r1 R3 R1 r3 r1
152 "packuswb %%mm6,%%mm6\n" // R2 R0 r2 r0 R2 R0 r2 r0 152 "packuswb %%mm6,%%mm6\n" // R2 R0 r2 r0 R2 R0 r2 r0
153 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00 153 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00
154 "punpcklbw %%mm5,%%mm6\n" // R3 R2 R1 R0 r3 r2 r1 r0 154 "punpcklbw %%mm5,%%mm6\n" // R3 R2 R1 R0 r3 r2 r1 r0
155 // create Cb (result in mm1) 155 // create Cb (result in mm1)
156 "movd (%1), %%mm1\n" // 0 0 0 0 u3 u2 u1 u0 156 "movd (%1), %%mm1\n" // 0 0 0 0 u3 u2 u1 u0
157 "punpcklbw %%mm7,%%mm1\n" // 0 u3 0 u2 00 u1 00 u0 157 "punpcklbw %%mm7,%%mm1\n" // 0 u3 0 u2 00 u1 00 u0
158 "punpckldq %%mm1,%%mm1\n" // 00 u1 00 u0 00 u1 00 u0 158 "punpckldq %%mm1,%%mm1\n" // 00 u1 00 u0 00 u1 00 u0
159 "psubw _MMX_0080w,%%mm1\n" // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 159 "psubw _MMX_0080w,%%mm1\n" // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0
160 // create Cb_g (result in mm5) 160 // create Cb_g (result in mm5)
161 "movq %%mm1,%%mm5\n" // u1 u1 u0 u0 u1 u1 u0 u0 161 "movq %%mm1,%%mm5\n" // u1 u1 u0 u0 u1 u1 u0 u0
162 "pmullw _MMX_UgrnRGB,%%mm5\n" // blue*-22dec=0.34375*64 162 "pmullw _MMX_UgrnRGB,%%mm5\n" // blue*-22dec=0.34375*64
163 "pmullw _MMX_UbluRGB,%%mm1\n" // blue*114dec=1.78125*64 163 "pmullw _MMX_UbluRGB,%%mm1\n" // blue*114dec=1.78125*64
164 "psraw $6, %%mm5\n" // blue=blue/64 164 "psraw $6, %%mm5\n" // blue=blue/64
165 "psraw $6, %%mm1\n" // blue=blue/64 165 "psraw $6, %%mm1\n" // blue=blue/64
166 // create G (result in mm7) 166 // create G (result in mm7)
167 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1 167 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1
168 "movq %%mm4,%%mm7\n" // 0 L2 0 L0 0 l2 0 l0 168 "movq %%mm4,%%mm7\n" // 0 L2 0 L0 0 l2 0 l0
169 "paddsw %%mm5, %%mm3\n" // lum1+Cb_g:x G3t x G1t x g3t x g1t 169 "paddsw %%mm5, %%mm3\n" // lum1+Cb_g:x G3t x G1t x g3t x g1t
170 "paddsw %%mm5, %%mm7\n" // lum1+Cb_g:x G2t x G0t x g2t x g0t 170 "paddsw %%mm5, %%mm7\n" // lum1+Cb_g:x G2t x G0t x g2t x g0t
171 "paddsw %%mm0, %%mm3\n" // lum1+Cr_g:x G3 x G1 x g3 x g1 171 "paddsw %%mm0, %%mm3\n" // lum1+Cr_g:x G3 x G1 x g3 x g1
172 "paddsw %%mm0, %%mm7\n" // lum1+Cr_g:x G2 x G0 x g2 x g0 172 "paddsw %%mm0, %%mm7\n" // lum1+Cr_g:x G2 x G0 x g2 x g0
173 "packuswb %%mm3,%%mm3\n" // G3 G1 g3 g1 G3 G1 g3 g1 173 "packuswb %%mm3,%%mm3\n" // G3 G1 g3 g1 G3 G1 g3 g1
174 "packuswb %%mm7,%%mm7\n" // G2 G0 g2 g0 G2 G0 g2 g0 174 "packuswb %%mm7,%%mm7\n" // G2 G0 g2 g0 G2 G0 g2 g0
175 "punpcklbw %%mm3,%%mm7\n" // G3 G2 G1 G0 g3 g2 g1 g0 175 "punpcklbw %%mm3,%%mm7\n" // G3 G2 G1 G0 g3 g2 g1 g0
176 // create B (result in mm5) 176 // create B (result in mm5)
177 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1 177 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1
178 "movq %%mm4,%%mm5\n" // 0 L2 0 L0 0 l2 0 l0 178 "movq %%mm4,%%mm5\n" // 0 L2 0 L0 0 l2 0 l0
179 "paddsw %%mm1, %%mm3\n" // lum1+blue:x B3 x B1 x b3 x b1 179 "paddsw %%mm1, %%mm3\n" // lum1+blue:x B3 x B1 x b3 x b1
180 "paddsw %%mm1, %%mm5\n" // lum1+blue:x B2 x B0 x b2 x b0 180 "paddsw %%mm1, %%mm5\n" // lum1+blue:x B2 x B0 x b2 x b0
181 "packuswb %%mm3,%%mm3\n" // B3 B1 b3 b1 B3 B1 b3 b1 181 "packuswb %%mm3,%%mm3\n" // B3 B1 b3 b1 B3 B1 b3 b1
182 "packuswb %%mm5,%%mm5\n" // B2 B0 b2 b0 B2 B0 b2 b0 182 "packuswb %%mm5,%%mm5\n" // B2 B0 b2 b0 B2 B0 b2 b0
183 "punpcklbw %%mm3,%%mm5\n" // B3 B2 B1 B0 b3 b2 b1 b0 183 "punpcklbw %%mm3,%%mm5\n" // B3 B2 B1 B0 b3 b2 b1 b0
184 // fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb) 184 // fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
185 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0 185 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0
186 "pxor %%mm4,%%mm4\n" // 0 0 0 0 0 0 0 0 186 "pxor %%mm4,%%mm4\n" // 0 0 0 0 0 0 0 0
187 "movq %%mm6,%%mm1\n" // R3 R2 R1 R0 r3 r2 r1 r0 187 "movq %%mm6,%%mm1\n" // R3 R2 R1 R0 r3 r2 r1 r0
188 "movq %%mm5,%%mm3\n" // B3 B2 B1 B0 b3 b2 b1 b0 188 "movq %%mm5,%%mm3\n" // B3 B2 B1 B0 b3 b2 b1 b0
189 // process lower lum 189 // process lower lum
190 "punpcklbw %%mm4,%%mm1\n" // 0 r3 0 r2 0 r1 0 r0 190 "punpcklbw %%mm4,%%mm1\n" // 0 r3 0 r2 0 r1 0 r0
191 "punpcklbw %%mm4,%%mm3\n" // 0 b3 0 b2 0 b1 0 b0 191 "punpcklbw %%mm4,%%mm3\n" // 0 b3 0 b2 0 b1 0 b0
192 "movq %%mm1,%%mm2\n" // 0 r3 0 r2 0 r1 0 r0 192 "movq %%mm1,%%mm2\n" // 0 r3 0 r2 0 r1 0 r0
193 "movq %%mm3,%%mm0\n" // 0 b3 0 b2 0 b1 0 b0 193 "movq %%mm3,%%mm0\n" // 0 b3 0 b2 0 b1 0 b0
194 "punpcklwd %%mm1,%%mm3\n" // 0 r1 0 b1 0 r0 0 b0 194 "punpcklwd %%mm1,%%mm3\n" // 0 r1 0 b1 0 r0 0 b0
195 "punpckhwd %%mm2,%%mm0\n" // 0 r3 0 b3 0 r2 0 b2 195 "punpckhwd %%mm2,%%mm0\n" // 0 r3 0 b3 0 r2 0 b2
196 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0 196 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0
197 "movq %%mm7,%%mm1\n" // G3 G2 G1 G0 g3 g2 g1 g0 197 "movq %%mm7,%%mm1\n" // G3 G2 G1 G0 g3 g2 g1 g0
198 "punpcklbw %%mm1,%%mm2\n" // g3 0 g2 0 g1 0 g0 0 198 "punpcklbw %%mm1,%%mm2\n" // g3 0 g2 0 g1 0 g0 0
199 "punpcklwd %%mm4,%%mm2\n" // 0 0 g1 0 0 0 g0 0 199 "punpcklwd %%mm4,%%mm2\n" // 0 0 g1 0 0 0 g0 0
200 "por %%mm3, %%mm2\n" // 0 r1 g1 b1 0 r0 g0 b0 200 "por %%mm3, %%mm2\n" // 0 r1 g1 b1 0 r0 g0 b0
201 "movq %%mm2,(%3)\n" // wrote out ! row1 201 "movq %%mm2,(%3)\n" // wrote out ! row1
202 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0 202 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0
203 "punpcklbw %%mm1,%%mm4\n" // g3 0 g2 0 g1 0 g0 0 203 "punpcklbw %%mm1,%%mm4\n" // g3 0 g2 0 g1 0 g0 0
204 "punpckhwd %%mm2,%%mm4\n" // 0 0 g3 0 0 0 g2 0 204 "punpckhwd %%mm2,%%mm4\n" // 0 0 g3 0 0 0 g2 0
205 "por %%mm0, %%mm4\n" // 0 r3 g3 b3 0 r2 g2 b2 205 "por %%mm0, %%mm4\n" // 0 r3 g3 b3 0 r2 g2 b2
206 "movq %%mm4,8(%3)\n" // wrote out ! row1 206 "movq %%mm4,8(%3)\n" // wrote out ! row1
207 // fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb) 207 // fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
208 // this can be done "destructive" 208 // this can be done "destructive"
209 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0 209 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0
210 "punpckhbw %%mm2,%%mm6\n" // 0 R3 0 R2 0 R1 0 R0 210 "punpckhbw %%mm2,%%mm6\n" // 0 R3 0 R2 0 R1 0 R0
211 "punpckhbw %%mm1,%%mm5\n" // G3 B3 G2 B2 G1 B1 G0 B0 211 "punpckhbw %%mm1,%%mm5\n" // G3 B3 G2 B2 G1 B1 G0 B0
212 "movq %%mm5,%%mm1\n" // G3 B3 G2 B2 G1 B1 G0 B0 212 "movq %%mm5,%%mm1\n" // G3 B3 G2 B2 G1 B1 G0 B0
213 "punpcklwd %%mm6,%%mm1\n" // 0 R1 G1 B1 0 R0 G0 B0 213 "punpcklwd %%mm6,%%mm1\n" // 0 R1 G1 B1 0 R0 G0 B0
214 "movq %%mm1,(%5)\n" // wrote out ! row2 214 "movq %%mm1,(%5)\n" // wrote out ! row2
215 "punpckhwd %%mm6,%%mm5\n" // 0 R3 G3 B3 0 R2 G2 B2 215 "punpckhwd %%mm6,%%mm5\n" // 0 R3 G3 B3 0 R2 G2 B2
216 "movq %%mm5,8(%5)\n" // wrote out ! row2 216 "movq %%mm5,8(%5)\n" // wrote out ! row2
217 "addl $4,%2\n" // lum+4 217 "addl $4,%2\n" // lum+4
218 "leal 16(%3),%3\n" // row1+16 218 "leal 16(%3),%3\n" // row1+16
219 "leal 16(%5),%5\n" // row2+16 219 "leal 16(%5),%5\n" // row2+16
220 "addl $2, %%ebx\n" // cr+2 220 "addl $2, %%ebx\n" // cr+2
221 "addl $2, %1\n" // cb+2 221 "addl $2, %1\n" // cb+2
222 "addl $4,%6\n" // x+4 222 "addl $4,%6\n" // x+4
223 "cmpl %4,%6\n" "jl 1b\n" "addl %4, %2\n" // lum += cols 223 "cmpl %4,%6\n" "jl 1b\n" "addl %4, %2\n" // lum += cols
224 "addl %8, %3\n" // row1+= mod 224 "addl %8, %3\n" // row1+= mod
225 "addl %8, %5\n" // row2+= mod 225 "addl %8, %5\n" // row2+= mod
226 "movl $0, %6\n" // x=0 226 "movl $0, %6\n" // x=0
227 "cmpl %7, %2\n" 227 "cmpl %7, %2\n"
228 "jl 1b\n" 228 "jl 1b\n"
229 "emms\n" 229 "emms\n"
230 "popl %%ebx\n"::"m" (cr), "r" (cb), "r" (lum), 230 "popl %%ebx\n"::"m"(cr), "r"(cb), "r"(lum),
231 "r" (row1), "r" (cols), "r" (row2), "m" (x), 231 "r"(row1), "r"(cols), "r"(row2), "m"(x),
232 "m" (y), "m" (mod)); 232 "m"(y), "m"(mod));
233 } 233 }
234 234
235 void 235 void
236 Color565DitherYV12MMX1X (int *colortab, Uint32 * rgb_2_pix, 236 Color565DitherYV12MMX1X(int *colortab, Uint32 * rgb_2_pix,
237 unsigned char *lum, unsigned char *cr, 237 unsigned char *lum, unsigned char *cr,
238 unsigned char *cb, unsigned char *out, 238 unsigned char *cb, unsigned char *out,
239 int rows, int cols, int mod) 239 int rows, int cols, int mod)
240 { 240 {
241 Uint16 *row1; 241 Uint16 *row1;
242 Uint16 *row2; 242 Uint16 *row2;
243 243
244 unsigned char *y = lum + cols * rows; /* Pointer to the end */ 244 unsigned char *y = lum + cols * rows; /* Pointer to the end */
246 row1 = (Uint16 *) out; /* 16 bit target */ 246 row1 = (Uint16 *) out; /* 16 bit target */
247 row2 = (Uint16 *) out + cols + mod; /* start of second row */ 247 row2 = (Uint16 *) out + cols + mod; /* start of second row */
248 mod = (mod + cols + mod) * 2; /* increment for row1 in byte */ 248 mod = (mod + cols + mod) * 2; /* increment for row1 in byte */
249 249
250 250
251 __asm__ __volatile__ ("pushl %%ebx\n" "movl %0, %%ebx\n" ".align 8\n" "1:\n" "movd (%1), %%mm0\n" // 4 Cb 0 0 0 0 u3 u2 u1 u0 251 __asm__ __volatile__("pushl %%ebx\n" "movl %0, %%ebx\n" ".align 8\n" "1:\n" "movd (%1), %%mm0\n" // 4 Cb 0 0 0 0 u3 u2 u1 u0
252 "pxor %%mm7, %%mm7\n" "movd (%%ebx), %%mm1\n" // 4 Cr 0 0 0 0 v3 v2 v1 v0 252 "pxor %%mm7, %%mm7\n" "movd (%%ebx), %%mm1\n" // 4 Cr 0 0 0 0 v3 v2 v1 v0
253 "punpcklbw %%mm7, %%mm0\n" // 4 W cb 0 u3 0 u2 0 u1 0 u0 253 "punpcklbw %%mm7, %%mm0\n" // 4 W cb 0 u3 0 u2 0 u1 0 u0
254 "punpcklbw %%mm7, %%mm1\n" // 4 W cr 0 v3 0 v2 0 v1 0 v0 254 "punpcklbw %%mm7, %%mm1\n" // 4 W cr 0 v3 0 v2 0 v1 0 v0
255 "psubw _MMX_0080w, %%mm0\n" "psubw _MMX_0080w, %%mm1\n" "movq %%mm0, %%mm2\n" // Cb 0 u3 0 u2 0 u1 0 u0 255 "psubw _MMX_0080w, %%mm0\n" "psubw _MMX_0080w, %%mm1\n" "movq %%mm0, %%mm2\n" // Cb 0 u3 0 u2 0 u1 0 u0
256 "movq %%mm1, %%mm3\n" // Cr 256 "movq %%mm1, %%mm3\n" // Cr
257 "pmullw _MMX_Ugrn565, %%mm2\n" // Cb2green 0 R3 0 R2 0 R1 0 R0 257 "pmullw _MMX_Ugrn565, %%mm2\n" // Cb2green 0 R3 0 R2 0 R1 0 R0
258 "movq (%2), %%mm6\n" // L1 l7 L6 L5 L4 L3 L2 L1 L0 258 "movq (%2), %%mm6\n" // L1 l7 L6 L5 L4 L3 L2 L1 L0
259 "pmullw _MMX_Ublu5x5, %%mm0\n" // Cb2blue 259 "pmullw _MMX_Ublu5x5, %%mm0\n" // Cb2blue
260 "pand _MMX_00FFw, %%mm6\n" // L1 00 L6 00 L4 00 L2 00 L0 260 "pand _MMX_00FFw, %%mm6\n" // L1 00 L6 00 L4 00 L2 00 L0
261 "pmullw _MMX_Vgrn565, %%mm3\n" // Cr2green 261 "pmullw _MMX_Vgrn565, %%mm3\n" // Cr2green
262 "movq (%2), %%mm7\n" // L2 262 "movq (%2), %%mm7\n" // L2
263 "pmullw _MMX_Vred5x5, %%mm1\n" // Cr2red 263 "pmullw _MMX_Vred5x5, %%mm1\n" // Cr2red
264 "psrlw $8, %%mm7\n" // L2 00 L7 00 L5 00 L3 00 L1 264 "psrlw $8, %%mm7\n" // L2 00 L7 00 L5 00 L3 00 L1
265 "pmullw _MMX_Ycoeff, %%mm6\n" // lum1 265 "pmullw _MMX_Ycoeff, %%mm6\n" // lum1
266 "paddw %%mm3, %%mm2\n" // Cb2green + Cr2green == green 266 "paddw %%mm3, %%mm2\n" // Cb2green + Cr2green == green
267 "pmullw _MMX_Ycoeff, %%mm7\n" // lum2 267 "pmullw _MMX_Ycoeff, %%mm7\n" // lum2
268 "movq %%mm6, %%mm4\n" // lum1 268 "movq %%mm6, %%mm4\n" // lum1
269 "paddw %%mm0, %%mm6\n" // lum1 +blue 00 B6 00 B4 00 B2 00 B0 269 "paddw %%mm0, %%mm6\n" // lum1 +blue 00 B6 00 B4 00 B2 00 B0
270 "movq %%mm4, %%mm5\n" // lum1 270 "movq %%mm4, %%mm5\n" // lum1
271 "paddw %%mm1, %%mm4\n" // lum1 +red 00 R6 00 R4 00 R2 00 R0 271 "paddw %%mm1, %%mm4\n" // lum1 +red 00 R6 00 R4 00 R2 00 R0
272 "paddw %%mm2, %%mm5\n" // lum1 +green 00 G6 00 G4 00 G2 00 G0 272 "paddw %%mm2, %%mm5\n" // lum1 +green 00 G6 00 G4 00 G2 00 G0
273 "psraw $6, %%mm4\n" // R1 0 .. 64 273 "psraw $6, %%mm4\n" // R1 0 .. 64
274 "movq %%mm7, %%mm3\n" // lum2 00 L7 00 L5 00 L3 00 L1 274 "movq %%mm7, %%mm3\n" // lum2 00 L7 00 L5 00 L3 00 L1
275 "psraw $6, %%mm5\n" // G1 - .. + 275 "psraw $6, %%mm5\n" // G1 - .. +
276 "paddw %%mm0, %%mm7\n" // Lum2 +blue 00 B7 00 B5 00 B3 00 B1 276 "paddw %%mm0, %%mm7\n" // Lum2 +blue 00 B7 00 B5 00 B3 00 B1
277 "psraw $6, %%mm6\n" // B1 0 .. 64 277 "psraw $6, %%mm6\n" // B1 0 .. 64
278 "packuswb %%mm4, %%mm4\n" // R1 R1 278 "packuswb %%mm4, %%mm4\n" // R1 R1
279 "packuswb %%mm5, %%mm5\n" // G1 G1 279 "packuswb %%mm5, %%mm5\n" // G1 G1
280 "packuswb %%mm6, %%mm6\n" // B1 B1 280 "packuswb %%mm6, %%mm6\n" // B1 B1
281 "punpcklbw %%mm4, %%mm4\n" "punpcklbw %%mm5, %%mm5\n" "pand _MMX_red565, %%mm4\n" "psllw $3, %%mm5\n" // GREEN 1 281 "punpcklbw %%mm4, %%mm4\n" "punpcklbw %%mm5, %%mm5\n" "pand _MMX_red565, %%mm4\n" "psllw $3, %%mm5\n" // GREEN 1
282 "punpcklbw %%mm6, %%mm6\n" "pand _MMX_grn565, %%mm5\n" "pand _MMX_red565, %%mm6\n" "por %%mm5, %%mm4\n" // 282 "punpcklbw %%mm6, %%mm6\n" "pand _MMX_grn565, %%mm5\n" "pand _MMX_red565, %%mm6\n" "por %%mm5, %%mm4\n" //
283 "psrlw $11, %%mm6\n" // BLUE 1 283 "psrlw $11, %%mm6\n" // BLUE 1
284 "movq %%mm3, %%mm5\n" // lum2 284 "movq %%mm3, %%mm5\n" // lum2
285 "paddw %%mm1, %%mm3\n" // lum2 +red 00 R7 00 R5 00 R3 00 R1 285 "paddw %%mm1, %%mm3\n" // lum2 +red 00 R7 00 R5 00 R3 00 R1
286 "paddw %%mm2, %%mm5\n" // lum2 +green 00 G7 00 G5 00 G3 00 G1 286 "paddw %%mm2, %%mm5\n" // lum2 +green 00 G7 00 G5 00 G3 00 G1
287 "psraw $6, %%mm3\n" // R2 287 "psraw $6, %%mm3\n" // R2
288 "por %%mm6, %%mm4\n" // MM4 288 "por %%mm6, %%mm4\n" // MM4
289 "psraw $6, %%mm5\n" // G2 289 "psraw $6, %%mm5\n" // G2
290 "movq (%2, %4), %%mm6\n" // L3 load lum2 290 "movq (%2, %4), %%mm6\n" // L3 load lum2
291 "psraw $6, %%mm7\n" "packuswb %%mm3, %%mm3\n" "packuswb %%mm5, %%mm5\n" "packuswb %%mm7, %%mm7\n" "pand _MMX_00FFw, %%mm6\n" // L3 291 "psraw $6, %%mm7\n" "packuswb %%mm3, %%mm3\n" "packuswb %%mm5, %%mm5\n" "packuswb %%mm7, %%mm7\n" "pand _MMX_00FFw, %%mm6\n" // L3
292 "punpcklbw %%mm3, %%mm3\n" "punpcklbw %%mm5, %%mm5\n" "pmullw _MMX_Ycoeff, %%mm6\n" // lum3 292 "punpcklbw %%mm3, %%mm3\n" "punpcklbw %%mm5, %%mm5\n" "pmullw _MMX_Ycoeff, %%mm6\n" // lum3
293 "punpcklbw %%mm7, %%mm7\n" "psllw $3, %%mm5\n" // GREEN 2 293 "punpcklbw %%mm7, %%mm7\n" "psllw $3, %%mm5\n" // GREEN 2
294 "pand _MMX_red565, %%mm7\n" "pand _MMX_red565, %%mm3\n" "psrlw $11, %%mm7\n" // BLUE 2 294 "pand _MMX_red565, %%mm7\n" "pand _MMX_red565, %%mm3\n" "psrlw $11, %%mm7\n" // BLUE 2
295 "pand _MMX_grn565, %%mm5\n" "por %%mm7, %%mm3\n" "movq (%2,%4), %%mm7\n" // L4 load lum2 295 "pand _MMX_grn565, %%mm5\n" "por %%mm7, %%mm3\n" "movq (%2,%4), %%mm7\n" // L4 load lum2
296 "por %%mm5, %%mm3\n" // 296 "por %%mm5, %%mm3\n" //
297 "psrlw $8, %%mm7\n" // L4 297 "psrlw $8, %%mm7\n" // L4
298 "movq %%mm4, %%mm5\n" "punpcklwd %%mm3, %%mm4\n" "pmullw _MMX_Ycoeff, %%mm7\n" // lum4 298 "movq %%mm4, %%mm5\n" "punpcklwd %%mm3, %%mm4\n" "pmullw _MMX_Ycoeff, %%mm7\n" // lum4
299 "punpckhwd %%mm3, %%mm5\n" "movq %%mm4, (%3)\n" // write row1 299 "punpckhwd %%mm3, %%mm5\n" "movq %%mm4, (%3)\n" // write row1
300 "movq %%mm5, 8(%3)\n" // write row1 300 "movq %%mm5, 8(%3)\n" // write row1
301 "movq %%mm6, %%mm4\n" // Lum3 301 "movq %%mm6, %%mm4\n" // Lum3
302 "paddw %%mm0, %%mm6\n" // Lum3 +blue 302 "paddw %%mm0, %%mm6\n" // Lum3 +blue
303 "movq %%mm4, %%mm5\n" // Lum3 303 "movq %%mm4, %%mm5\n" // Lum3
304 "paddw %%mm1, %%mm4\n" // Lum3 +red 304 "paddw %%mm1, %%mm4\n" // Lum3 +red
305 "paddw %%mm2, %%mm5\n" // Lum3 +green 305 "paddw %%mm2, %%mm5\n" // Lum3 +green
306 "psraw $6, %%mm4\n" "movq %%mm7, %%mm3\n" // Lum4 306 "psraw $6, %%mm4\n" "movq %%mm7, %%mm3\n" // Lum4
307 "psraw $6, %%mm5\n" "paddw %%mm0, %%mm7\n" // Lum4 +blue 307 "psraw $6, %%mm5\n" "paddw %%mm0, %%mm7\n" // Lum4 +blue
308 "psraw $6, %%mm6\n" // Lum3 +blue 308 "psraw $6, %%mm6\n" // Lum3 +blue
309 "movq %%mm3, %%mm0\n" // Lum4 309 "movq %%mm3, %%mm0\n" // Lum4
310 "packuswb %%mm4, %%mm4\n" "paddw %%mm1, %%mm3\n" // Lum4 +red 310 "packuswb %%mm4, %%mm4\n" "paddw %%mm1, %%mm3\n" // Lum4 +red
311 "packuswb %%mm5, %%mm5\n" "paddw %%mm2, %%mm0\n" // Lum4 +green 311 "packuswb %%mm5, %%mm5\n" "paddw %%mm2, %%mm0\n" // Lum4 +green
312 "packuswb %%mm6, %%mm6\n" "punpcklbw %%mm4, %%mm4\n" "punpcklbw %%mm5, %%mm5\n" "punpcklbw %%mm6, %%mm6\n" "psllw $3, %%mm5\n" // GREEN 3 312 "packuswb %%mm6, %%mm6\n" "punpcklbw %%mm4, %%mm4\n" "punpcklbw %%mm5, %%mm5\n" "punpcklbw %%mm6, %%mm6\n" "psllw $3, %%mm5\n" // GREEN 3
313 "pand _MMX_red565, %%mm4\n" "psraw $6, %%mm3\n" // psr 6 313 "pand _MMX_red565, %%mm4\n" "psraw $6, %%mm3\n" // psr 6
314 "psraw $6, %%mm0\n" "pand _MMX_red565, %%mm6\n" // BLUE 314 "psraw $6, %%mm0\n" "pand _MMX_red565, %%mm6\n" // BLUE
315 "pand _MMX_grn565, %%mm5\n" "psrlw $11, %%mm6\n" // BLUE 3 315 "pand _MMX_grn565, %%mm5\n" "psrlw $11, %%mm6\n" // BLUE 3
316 "por %%mm5, %%mm4\n" "psraw $6, %%mm7\n" "por %%mm6, %%mm4\n" "packuswb %%mm3, %%mm3\n" "packuswb %%mm0, %%mm0\n" "packuswb %%mm7, %%mm7\n" "punpcklbw %%mm3, %%mm3\n" "punpcklbw %%mm0, %%mm0\n" "punpcklbw %%mm7, %%mm7\n" "pand _MMX_red565, %%mm3\n" "pand _MMX_red565, %%mm7\n" // BLUE 316 "por %%mm5, %%mm4\n" "psraw $6, %%mm7\n" "por %%mm6, %%mm4\n" "packuswb %%mm3, %%mm3\n" "packuswb %%mm0, %%mm0\n" "packuswb %%mm7, %%mm7\n" "punpcklbw %%mm3, %%mm3\n" "punpcklbw %%mm0, %%mm0\n" "punpcklbw %%mm7, %%mm7\n" "pand _MMX_red565, %%mm3\n" "pand _MMX_red565, %%mm7\n" // BLUE
317 "psllw $3, %%mm0\n" // GREEN 4 317 "psllw $3, %%mm0\n" // GREEN 4
318 "psrlw $11, %%mm7\n" "pand _MMX_grn565, %%mm0\n" "por %%mm7, %%mm3\n" "por %%mm0, %%mm3\n" "movq %%mm4, %%mm5\n" "punpcklwd %%mm3, %%mm4\n" "punpckhwd %%mm3, %%mm5\n" "movq %%mm4, (%5)\n" "movq %%mm5, 8(%5)\n" "addl $8, %6\n" "addl $8, %2\n" "addl $4, %%ebx\n" "addl $4, %1\n" "cmpl %4, %6\n" "leal 16(%3), %3\n" "leal 16(%5),%5\n" // row2+16 318 "psrlw $11, %%mm7\n" "pand _MMX_grn565, %%mm0\n" "por %%mm7, %%mm3\n" "por %%mm0, %%mm3\n" "movq %%mm4, %%mm5\n" "punpcklwd %%mm3, %%mm4\n" "punpckhwd %%mm3, %%mm5\n" "movq %%mm4, (%5)\n" "movq %%mm5, 8(%5)\n" "addl $8, %6\n" "addl $8, %2\n" "addl $4, %%ebx\n" "addl $4, %1\n" "cmpl %4, %6\n" "leal 16(%3), %3\n" "leal 16(%5),%5\n" // row2+16
319 "jl 1b\n" "addl %4, %2\n" // lum += cols 319 "jl 1b\n" "addl %4, %2\n" // lum += cols
320 "addl %8, %3\n" // row1+= mod 320 "addl %8, %3\n" // row1+= mod
321 "addl %8, %5\n" // row2+= mod 321 "addl %8, %5\n" // row2+= mod
322 "movl $0, %6\n" // x=0 322 "movl $0, %6\n" // x=0
323 "cmpl %7, %2\n" 323 "cmpl %7, %2\n"
324 "jl 1b\n" 324 "jl 1b\n"
325 "emms\n" 325 "emms\n"
326 "popl %%ebx\n"::"m" (cr), "r" (cb), "r" (lum), 326 "popl %%ebx\n"::"m"(cr), "r"(cb), "r"(lum),
327 "r" (row1), "r" (cols), "r" (row2), "m" (x), 327 "r"(row1), "r"(cols), "r"(row2), "m"(x),
328 "m" (y), "m" (mod)); 328 "m"(y), "m"(mod));
329 } 329 }
330 330
331 #endif /* GCC i386 inline assembly */ 331 #endif /* GCC i386 inline assembly */
332 #endif /* 0 */ 332 #endif /* 0 */
333 /* vi: set ts=4 sw=4 expandtab: */ 333 /* vi: set ts=4 sw=4 expandtab: */