comparison src/video/SDL_yuv_mmx.c @ 1662:782fd950bd46 SDL-1.3

Revamp of the video system in progress - adding support for multiple displays, multiple windows, and a full video mode selection API. WARNING: None of the video drivers have been updated for the new API yet! The API is still under design and very fluid. The code is now run through a consistent indent format: `indent -i4 -nut -nsc -br -ce`. The headers are being converted to automatically generate doxygen documentation.
author Sam Lantinga <slouken@libsdl.org>
date Sun, 28 May 2006 13:04:16 +0000
parents 40edc79b0926
children 4da1ee79c9af
comparison
equal deleted inserted replaced
1661:281d3f4870e5 1662:782fd950bd46
19 Sam Lantinga 19 Sam Lantinga
20 slouken@libsdl.org 20 slouken@libsdl.org
21 */ 21 */
22 #include "SDL_config.h" 22 #include "SDL_config.h"
23 23
24 #if 0 /* FIXME: This code needs to be rewritten to reference the static data using relocatable addresses (e.g. http://www.gentoo.org/proj/en/hardened/pic-fix-guide.xml or http://nasm.sourceforge.net/doc/html/nasmdoc8.html#section-8.2) This code currently breaks on systems with readonly text segments (hardened Linux / Intel Mac) */ 24 #if 0 /* FIXME: This code needs to be rewritten to reference the static data using relocatable addresses (e.g. http://www.gentoo.org/proj/en/hardened/pic-fix-guide.xml or http://nasm.sourceforge.net/doc/html/nasmdoc8.html#section-8.2) This code currently breaks on systems with readonly text segments (hardened Linux / Intel Mac) */
25 #if defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES 25 #if defined(__GNUC__) && defined(__i386__) && SDL_ASSEMBLY_ROUTINES
26 26
27 #include "SDL_stdinc.h" 27 #include "SDL_stdinc.h"
28 28
29 #define ASM_ARRAY(x) x[] __asm__("_" #x) __attribute__((used)) 29 #define ASM_ARRAY(x) x[] __asm__("_" #x) __attribute__((used))
30 30
31 static unsigned int ASM_ARRAY(MMX_0080w) = {0x00800080, 0x00800080}; 31 static unsigned int ASM_ARRAY (MMX_0080w) = {
32 static unsigned int ASM_ARRAY(MMX_00FFw) = {0x00ff00ff, 0x00ff00ff}; 32 0x00800080, 0x00800080};
33 static unsigned int ASM_ARRAY(MMX_FF00w) = {0xff00ff00, 0xff00ff00}; 33 static unsigned int ASM_ARRAY (MMX_00FFw) = {
34 34 0x00ff00ff, 0x00ff00ff};
35 static unsigned short ASM_ARRAY(MMX_Ycoeff) = {0x004a, 0x004a, 0x004a, 0x004a}; 35 static unsigned int ASM_ARRAY (MMX_FF00w) = {
36 36 0xff00ff00, 0xff00ff00};
37 static unsigned short ASM_ARRAY(MMX_UbluRGB) = {0x0072, 0x0072, 0x0072, 0x0072}; 37
38 static unsigned short ASM_ARRAY(MMX_VredRGB) = {0x0059, 0x0059, 0x0059, 0x0059}; 38 static unsigned short ASM_ARRAY (MMX_Ycoeff) = {
39 static unsigned short ASM_ARRAY(MMX_UgrnRGB) = {0xffea, 0xffea, 0xffea, 0xffea}; 39 0x004a, 0x004a, 0x004a, 0x004a};
40 static unsigned short ASM_ARRAY(MMX_VgrnRGB) = {0xffd2, 0xffd2, 0xffd2, 0xffd2}; 40
41 41 static unsigned short ASM_ARRAY (MMX_UbluRGB) = {
42 static unsigned short ASM_ARRAY(MMX_Ublu5x5) = {0x0081, 0x0081, 0x0081, 0x0081}; 42 0x0072, 0x0072, 0x0072, 0x0072};
43 static unsigned short ASM_ARRAY(MMX_Vred5x5) = {0x0066, 0x0066, 0x0066, 0x0066}; 43 static unsigned short ASM_ARRAY (MMX_VredRGB) = {
44 static unsigned short ASM_ARRAY(MMX_Ugrn555) = {0xffe7, 0xffe7, 0xffe7, 0xffe7}; 44 0x0059, 0x0059, 0x0059, 0x0059};
45 static unsigned short ASM_ARRAY(MMX_Vgrn555) = {0xffcc, 0xffcc, 0xffcc, 0xffcc}; 45 static unsigned short ASM_ARRAY (MMX_UgrnRGB) = {
46 static unsigned short ASM_ARRAY(MMX_Ugrn565) = {0xffe8, 0xffe8, 0xffe8, 0xffe8}; 46 0xffea, 0xffea, 0xffea, 0xffea};
47 static unsigned short ASM_ARRAY(MMX_Vgrn565) = {0xffcd, 0xffcd, 0xffcd, 0xffcd}; 47 static unsigned short ASM_ARRAY (MMX_VgrnRGB) = {
48 48 0xffd2, 0xffd2, 0xffd2, 0xffd2};
49 static unsigned short ASM_ARRAY(MMX_red555) = {0x7c00, 0x7c00, 0x7c00, 0x7c00}; 49
50 static unsigned short ASM_ARRAY(MMX_red565) = {0xf800, 0xf800, 0xf800, 0xf800}; 50 static unsigned short ASM_ARRAY (MMX_Ublu5x5) = {
51 static unsigned short ASM_ARRAY(MMX_grn555) = {0x03e0, 0x03e0, 0x03e0, 0x03e0}; 51 0x0081, 0x0081, 0x0081, 0x0081};
52 static unsigned short ASM_ARRAY(MMX_grn565) = {0x07e0, 0x07e0, 0x07e0, 0x07e0}; 52 static unsigned short ASM_ARRAY (MMX_Vred5x5) = {
53 static unsigned short ASM_ARRAY(MMX_blu5x5) = {0x001f, 0x001f, 0x001f, 0x001f}; 53 0x0066, 0x0066, 0x0066, 0x0066};
54 static unsigned short ASM_ARRAY (MMX_Ugrn555) = {
55 0xffe7, 0xffe7, 0xffe7, 0xffe7};
56 static unsigned short ASM_ARRAY (MMX_Vgrn555) = {
57 0xffcc, 0xffcc, 0xffcc, 0xffcc};
58 static unsigned short ASM_ARRAY (MMX_Ugrn565) = {
59 0xffe8, 0xffe8, 0xffe8, 0xffe8};
60 static unsigned short ASM_ARRAY (MMX_Vgrn565) = {
61 0xffcd, 0xffcd, 0xffcd, 0xffcd};
62
63 static unsigned short ASM_ARRAY (MMX_red555) = {
64 0x7c00, 0x7c00, 0x7c00, 0x7c00};
65 static unsigned short ASM_ARRAY (MMX_red565) = {
66 0xf800, 0xf800, 0xf800, 0xf800};
67 static unsigned short ASM_ARRAY (MMX_grn555) = {
68 0x03e0, 0x03e0, 0x03e0, 0x03e0};
69 static unsigned short ASM_ARRAY (MMX_grn565) = {
70 0x07e0, 0x07e0, 0x07e0, 0x07e0};
71 static unsigned short ASM_ARRAY (MMX_blu5x5) = {
72 0x001f, 0x001f, 0x001f, 0x001f};
54 73
55 /** 74 /**
56 This MMX assembler is my first assembler/MMX program ever. 75 This MMX assembler is my first assembler/MMX program ever.
57 Thus it may be buggy. 76 Thus it may be buggy.
58 Send patches to: 77 Send patches to:
79 It is a requirement that cr/cb/lum are 8-byte aligned and 98 It is a requirement that cr/cb/lum are 8-byte aligned and
80 out is 16-byte aligned, or you may get segfaults 99 out is 16-byte aligned, or you may get segfaults
81 100
82 */ 101 */
83 102
84 void ColorRGBDitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix, 103 void
85 unsigned char *lum, unsigned char *cr, 104 ColorRGBDitherYV12MMX1X (int *colortab, Uint32 * rgb_2_pix,
86 unsigned char *cb, unsigned char *out, 105 unsigned char *lum, unsigned char *cr,
87 int rows, int cols, int mod ) 106 unsigned char *cb, unsigned char *out,
107 int rows, int cols, int mod)
88 { 108 {
89 Uint32 *row1; 109 Uint32 *row1;
90 Uint32 *row2; 110 Uint32 *row2;
91 111
92 unsigned char* y = lum +cols*rows; // Pointer to the end 112 unsigned char *y = lum + cols * rows; // Pointer to the end
93 int x=0; 113 int x = 0;
94 row1 = (Uint32 *)out; // 32 bit target 114 row1 = (Uint32 *) out; // 32 bit target
95 row2 = (Uint32 *)out+cols+mod; // start of second row 115 row2 = (Uint32 *) out + cols + mod; // start of second row
96 mod = (mod+cols+mod)*4; // increment for row1 in byte 116 mod = (mod + cols + mod) * 4; // increment for row1 in byte
97 117
98 __asm__ __volatile__ ( 118 __asm__ __volatile__ (
99 /* We don't really care about PIC - the code should be rewritten to use 119 /* We don't really care about PIC - the code should be rewritten to use
100 relative addressing for the static tables, so right now we take the 120 relative addressing for the static tables, so right now we take the
101 COW hit on the pages this code resides. Big deal. 121 COW hit on the pages this code resides. Big deal.
102 This spill is just to reduce register pressure in the PIC case. */ 122 This spill is just to reduce register pressure in the PIC case. */
103 "pushl %%ebx\n" 123 "pushl %%ebx\n"
104 "movl %0, %%ebx\n" 124 "movl %0, %%ebx\n" ".align 8\n" "1:\n"
105 125 // create Cr (result in mm1)
106 ".align 8\n" 126 "movd (%%ebx), %%mm1\n" // 0 0 0 0 v3 v2 v1 v0
107 "1:\n" 127 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00
108 128 "movd (%2), %%mm2\n" // 0 0 0 0 l3 l2 l1 l0
109 // create Cr (result in mm1) 129 "punpcklbw %%mm7,%%mm1\n" // 0 v3 0 v2 00 v1 00 v0
110 "movd (%%ebx), %%mm1\n" // 0 0 0 0 v3 v2 v1 v0 130 "punpckldq %%mm1,%%mm1\n" // 00 v1 00 v0 00 v1 00 v0
111 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00 131 "psubw _MMX_0080w,%%mm1\n" // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0
112 "movd (%2), %%mm2\n" // 0 0 0 0 l3 l2 l1 l0 132 // create Cr_g (result in mm0)
113 "punpcklbw %%mm7,%%mm1\n" // 0 v3 0 v2 00 v1 00 v0 133 "movq %%mm1,%%mm0\n" // r1 r1 r0 r0 r1 r1 r0 r0
114 "punpckldq %%mm1,%%mm1\n" // 00 v1 00 v0 00 v1 00 v0 134 "pmullw _MMX_VgrnRGB,%%mm0\n" // red*-46dec=0.7136*64
115 "psubw _MMX_0080w,%%mm1\n" // mm1-128:r1 r1 r0 r0 r1 r1 r0 r0 135 "pmullw _MMX_VredRGB,%%mm1\n" // red*89dec=1.4013*64
116 136 "psraw $6, %%mm0\n" // red=red/64
117 // create Cr_g (result in mm0) 137 "psraw $6, %%mm1\n" // red=red/64
118 "movq %%mm1,%%mm0\n" // r1 r1 r0 r0 r1 r1 r0 r0 138 // create L1 L2 (result in mm2,mm4)
119 "pmullw _MMX_VgrnRGB,%%mm0\n"// red*-46dec=0.7136*64 139 // L2=lum+cols
120 "pmullw _MMX_VredRGB,%%mm1\n"// red*89dec=1.4013*64 140 "movq (%2,%4),%%mm3\n" // 0 0 0 0 L3 L2 L1 L0
121 "psraw $6, %%mm0\n" // red=red/64 141 "punpckldq %%mm3,%%mm2\n" // L3 L2 L1 L0 l3 l2 l1 l0
122 "psraw $6, %%mm1\n" // red=red/64 142 "movq %%mm2,%%mm4\n" // L3 L2 L1 L0 l3 l2 l1 l0
123 143 "pand _MMX_FF00w,%%mm2\n" // L3 0 L1 0 l3 0 l1 0
124 // create L1 L2 (result in mm2,mm4) 144 "pand _MMX_00FFw,%%mm4\n" // 0 L2 0 L0 0 l2 0 l0
125 // L2=lum+cols 145 "psrlw $8,%%mm2\n" // 0 L3 0 L1 0 l3 0 l1
126 "movq (%2,%4),%%mm3\n" // 0 0 0 0 L3 L2 L1 L0 146 // create R (result in mm6)
127 "punpckldq %%mm3,%%mm2\n" // L3 L2 L1 L0 l3 l2 l1 l0 147 "movq %%mm2,%%mm5\n" // 0 L3 0 L1 0 l3 0 l1
128 "movq %%mm2,%%mm4\n" // L3 L2 L1 L0 l3 l2 l1 l0 148 "movq %%mm4,%%mm6\n" // 0 L2 0 L0 0 l2 0 l0
129 "pand _MMX_FF00w,%%mm2\n" // L3 0 L1 0 l3 0 l1 0 149 "paddsw %%mm1, %%mm5\n" // lum1+red:x R3 x R1 x r3 x r1
130 "pand _MMX_00FFw,%%mm4\n" // 0 L2 0 L0 0 l2 0 l0 150 "paddsw %%mm1, %%mm6\n" // lum1+red:x R2 x R0 x r2 x r0
131 "psrlw $8,%%mm2\n" // 0 L3 0 L1 0 l3 0 l1 151 "packuswb %%mm5,%%mm5\n" // R3 R1 r3 r1 R3 R1 r3 r1
132 152 "packuswb %%mm6,%%mm6\n" // R2 R0 r2 r0 R2 R0 r2 r0
133 // create R (result in mm6) 153 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00
134 "movq %%mm2,%%mm5\n" // 0 L3 0 L1 0 l3 0 l1 154 "punpcklbw %%mm5,%%mm6\n" // R3 R2 R1 R0 r3 r2 r1 r0
135 "movq %%mm4,%%mm6\n" // 0 L2 0 L0 0 l2 0 l0 155 // create Cb (result in mm1)
136 "paddsw %%mm1, %%mm5\n" // lum1+red:x R3 x R1 x r3 x r1 156 "movd (%1), %%mm1\n" // 0 0 0 0 u3 u2 u1 u0
137 "paddsw %%mm1, %%mm6\n" // lum1+red:x R2 x R0 x r2 x r0 157 "punpcklbw %%mm7,%%mm1\n" // 0 u3 0 u2 00 u1 00 u0
138 "packuswb %%mm5,%%mm5\n" // R3 R1 r3 r1 R3 R1 r3 r1 158 "punpckldq %%mm1,%%mm1\n" // 00 u1 00 u0 00 u1 00 u0
139 "packuswb %%mm6,%%mm6\n" // R2 R0 r2 r0 R2 R0 r2 r0 159 "psubw _MMX_0080w,%%mm1\n" // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0
140 "pxor %%mm7,%%mm7\n" // 00 00 00 00 00 00 00 00 160 // create Cb_g (result in mm5)
141 "punpcklbw %%mm5,%%mm6\n" // R3 R2 R1 R0 r3 r2 r1 r0 161 "movq %%mm1,%%mm5\n" // u1 u1 u0 u0 u1 u1 u0 u0
142 162 "pmullw _MMX_UgrnRGB,%%mm5\n" // blue*-109dec=1.7129*64
143 // create Cb (result in mm1) 163 "pmullw _MMX_UbluRGB,%%mm1\n" // blue*114dec=1.78125*64
144 "movd (%1), %%mm1\n" // 0 0 0 0 u3 u2 u1 u0 164 "psraw $6, %%mm5\n" // blue=red/64
145 "punpcklbw %%mm7,%%mm1\n" // 0 u3 0 u2 00 u1 00 u0 165 "psraw $6, %%mm1\n" // blue=blue/64
146 "punpckldq %%mm1,%%mm1\n" // 00 u1 00 u0 00 u1 00 u0 166 // create G (result in mm7)
147 "psubw _MMX_0080w,%%mm1\n" // mm1-128:u1 u1 u0 u0 u1 u1 u0 u0 167 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1
148 // create Cb_g (result in mm5) 168 "movq %%mm4,%%mm7\n" // 0 L2 0 L0 0 l2 0 l1
149 "movq %%mm1,%%mm5\n" // u1 u1 u0 u0 u1 u1 u0 u0 169 "paddsw %%mm5, %%mm3\n" // lum1+Cb_g:x G3t x G1t x g3t x g1t
150 "pmullw _MMX_UgrnRGB,%%mm5\n" // blue*-109dec=1.7129*64 170 "paddsw %%mm5, %%mm7\n" // lum1+Cb_g:x G2t x G0t x g2t x g0t
151 "pmullw _MMX_UbluRGB,%%mm1\n" // blue*114dec=1.78125*64 171 "paddsw %%mm0, %%mm3\n" // lum1+Cr_g:x G3 x G1 x g3 x g1
152 "psraw $6, %%mm5\n" // blue=red/64 172 "paddsw %%mm0, %%mm7\n" // lum1+blue:x G2 x G0 x g2 x g0
153 "psraw $6, %%mm1\n" // blue=blue/64 173 "packuswb %%mm3,%%mm3\n" // G3 G1 g3 g1 G3 G1 g3 g1
154 174 "packuswb %%mm7,%%mm7\n" // G2 G0 g2 g0 G2 G0 g2 g0
155 // create G (result in mm7) 175 "punpcklbw %%mm3,%%mm7\n" // G3 G2 G1 G0 g3 g2 g1 g0
156 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1 176 // create B (result in mm5)
157 "movq %%mm4,%%mm7\n" // 0 L2 0 L0 0 l2 0 l1 177 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1
158 "paddsw %%mm5, %%mm3\n" // lum1+Cb_g:x G3t x G1t x g3t x g1t 178 "movq %%mm4,%%mm5\n" // 0 L2 0 L0 0 l2 0 l1
159 "paddsw %%mm5, %%mm7\n" // lum1+Cb_g:x G2t x G0t x g2t x g0t 179 "paddsw %%mm1, %%mm3\n" // lum1+blue:x B3 x B1 x b3 x b1
160 "paddsw %%mm0, %%mm3\n" // lum1+Cr_g:x G3 x G1 x g3 x g1 180 "paddsw %%mm1, %%mm5\n" // lum1+blue:x B2 x B0 x b2 x b0
161 "paddsw %%mm0, %%mm7\n" // lum1+blue:x G2 x G0 x g2 x g0 181 "packuswb %%mm3,%%mm3\n" // B3 B1 b3 b1 B3 B1 b3 b1
162 "packuswb %%mm3,%%mm3\n" // G3 G1 g3 g1 G3 G1 g3 g1 182 "packuswb %%mm5,%%mm5\n" // B2 B0 b2 b0 B2 B0 b2 b0
163 "packuswb %%mm7,%%mm7\n" // G2 G0 g2 g0 G2 G0 g2 g0 183 "punpcklbw %%mm3,%%mm5\n" // B3 B2 B1 B0 b3 b2 b1 b0
164 "punpcklbw %%mm3,%%mm7\n" // G3 G2 G1 G0 g3 g2 g1 g0 184 // fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
165 185 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0
166 // create B (result in mm5) 186 "pxor %%mm4,%%mm4\n" // 0 0 0 0 0 0 0 0
167 "movq %%mm2,%%mm3\n" // 0 L3 0 L1 0 l3 0 l1 187 "movq %%mm6,%%mm1\n" // R3 R2 R1 R0 r3 r2 r1 r0
168 "movq %%mm4,%%mm5\n" // 0 L2 0 L0 0 l2 0 l1 188 "movq %%mm5,%%mm3\n" // B3 B2 B1 B0 b3 b2 b1 b0
169 "paddsw %%mm1, %%mm3\n" // lum1+blue:x B3 x B1 x b3 x b1 189 // process lower lum
170 "paddsw %%mm1, %%mm5\n" // lum1+blue:x B2 x B0 x b2 x b0 190 "punpcklbw %%mm4,%%mm1\n" // 0 r3 0 r2 0 r1 0 r0
171 "packuswb %%mm3,%%mm3\n" // B3 B1 b3 b1 B3 B1 b3 b1 191 "punpcklbw %%mm4,%%mm3\n" // 0 b3 0 b2 0 b1 0 b0
172 "packuswb %%mm5,%%mm5\n" // B2 B0 b2 b0 B2 B0 b2 b0 192 "movq %%mm1,%%mm2\n" // 0 r3 0 r2 0 r1 0 r0
173 "punpcklbw %%mm3,%%mm5\n" // B3 B2 B1 B0 b3 b2 b1 b0 193 "movq %%mm3,%%mm0\n" // 0 b3 0 b2 0 b1 0 b0
174 194 "punpcklwd %%mm1,%%mm3\n" // 0 r1 0 b1 0 r0 0 b0
175 // fill destination row1 (needed are mm6=Rr,mm7=Gg,mm5=Bb) 195 "punpckhwd %%mm2,%%mm0\n" // 0 r3 0 b3 0 r2 0 b2
176 196 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0
177 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0 197 "movq %%mm7,%%mm1\n" // G3 G2 G1 G0 g3 g2 g1 g0
178 "pxor %%mm4,%%mm4\n" // 0 0 0 0 0 0 0 0 198 "punpcklbw %%mm1,%%mm2\n" // g3 0 g2 0 g1 0 g0 0
179 "movq %%mm6,%%mm1\n" // R3 R2 R1 R0 r3 r2 r1 r0 199 "punpcklwd %%mm4,%%mm2\n" // 0 0 g1 0 0 0 g0 0
180 "movq %%mm5,%%mm3\n" // B3 B2 B1 B0 b3 b2 b1 b0 200 "por %%mm3, %%mm2\n" // 0 r1 g1 b1 0 r0 g0 b0
181 // process lower lum 201 "movq %%mm2,(%3)\n" // wrote out ! row1
182 "punpcklbw %%mm4,%%mm1\n" // 0 r3 0 r2 0 r1 0 r0 202 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0
183 "punpcklbw %%mm4,%%mm3\n" // 0 b3 0 b2 0 b1 0 b0 203 "punpcklbw %%mm1,%%mm4\n" // g3 0 g2 0 g1 0 g0 0
184 "movq %%mm1,%%mm2\n" // 0 r3 0 r2 0 r1 0 r0 204 "punpckhwd %%mm2,%%mm4\n" // 0 0 g3 0 0 0 g2 0
185 "movq %%mm3,%%mm0\n" // 0 b3 0 b2 0 b1 0 b0 205 "por %%mm0, %%mm4\n" // 0 r3 g3 b3 0 r2 g2 b2
186 "punpcklwd %%mm1,%%mm3\n" // 0 r1 0 b1 0 r0 0 b0 206 "movq %%mm4,8(%3)\n" // wrote out ! row1
187 "punpckhwd %%mm2,%%mm0\n" // 0 r3 0 b3 0 r2 0 b2 207 // fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb)
188 208 // this can be done "destructive"
189 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0 209 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0
190 "movq %%mm7,%%mm1\n" // G3 G2 G1 G0 g3 g2 g1 g0 210 "punpckhbw %%mm2,%%mm6\n" // 0 R3 0 R2 0 R1 0 R0
191 "punpcklbw %%mm1,%%mm2\n" // g3 0 g2 0 g1 0 g0 0 211 "punpckhbw %%mm1,%%mm5\n" // G3 B3 G2 B2 G1 B1 G0 B0
192 "punpcklwd %%mm4,%%mm2\n" // 0 0 g1 0 0 0 g0 0 212 "movq %%mm5,%%mm1\n" // G3 B3 G2 B2 G1 B1 G0 B0
193 "por %%mm3, %%mm2\n" // 0 r1 g1 b1 0 r0 g0 b0 213 "punpcklwd %%mm6,%%mm1\n" // 0 R1 G1 B1 0 R0 G0 B0
194 "movq %%mm2,(%3)\n" // wrote out ! row1 214 "movq %%mm1,(%5)\n" // wrote out ! row2
195 215 "punpckhwd %%mm6,%%mm5\n" // 0 R3 G3 B3 0 R2 G2 B2
196 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0 216 "movq %%mm5,8(%5)\n" // wrote out ! row2
197 "punpcklbw %%mm1,%%mm4\n" // g3 0 g2 0 g1 0 g0 0 217 "addl $4,%2\n" // lum+4
198 "punpckhwd %%mm2,%%mm4\n" // 0 0 g3 0 0 0 g2 0 218 "leal 16(%3),%3\n" // row1+16
199 "por %%mm0, %%mm4\n" // 0 r3 g3 b3 0 r2 g2 b2 219 "leal 16(%5),%5\n" // row2+16
200 "movq %%mm4,8(%3)\n" // wrote out ! row1 220 "addl $2, %%ebx\n" // cr+2
201 221 "addl $2, %1\n" // cb+2
202 // fill destination row2 (needed are mm6=Rr,mm7=Gg,mm5=Bb) 222 "addl $4,%6\n" // x+4
203 // this can be done "destructive" 223 "cmpl %4,%6\n" "jl 1b\n" "addl %4, %2\n" // lum += cols
204 "pxor %%mm2,%%mm2\n" // 0 0 0 0 0 0 0 0 224 "addl %8, %3\n" // row1+= mod
205 "punpckhbw %%mm2,%%mm6\n" // 0 R3 0 R2 0 R1 0 R0 225 "addl %8, %5\n" // row2+= mod
206 "punpckhbw %%mm1,%%mm5\n" // G3 B3 G2 B2 G1 B1 G0 B0 226 "movl $0, %6\n" // x=0
207 "movq %%mm5,%%mm1\n" // G3 B3 G2 B2 G1 B1 G0 B0 227 "cmpl %7, %2\n"
208 "punpcklwd %%mm6,%%mm1\n" // 0 R1 G1 B1 0 R0 G0 B0 228 "jl 1b\n"
209 "movq %%mm1,(%5)\n" // wrote out ! row2 229 "emms\n"
210 "punpckhwd %%mm6,%%mm5\n" // 0 R3 G3 B3 0 R2 G2 B2 230 "popl %%ebx\n"::"m" (cr), "r" (cb), "r" (lum),
211 "movq %%mm5,8(%5)\n" // wrote out ! row2 231 "r" (row1), "r" (cols), "r" (row2), "m" (x),
212 232 "m" (y), "m" (mod));
213 "addl $4,%2\n" // lum+4
214 "leal 16(%3),%3\n" // row1+16
215 "leal 16(%5),%5\n" // row2+16
216 "addl $2, %%ebx\n" // cr+2
217 "addl $2, %1\n" // cb+2
218
219 "addl $4,%6\n" // x+4
220 "cmpl %4,%6\n"
221
222 "jl 1b\n"
223 "addl %4, %2\n" // lum += cols
224 "addl %8, %3\n" // row1+= mod
225 "addl %8, %5\n" // row2+= mod
226 "movl $0, %6\n" // x=0
227 "cmpl %7, %2\n"
228 "jl 1b\n"
229 "emms\n"
230 "popl %%ebx\n"
231 :
232 : "m" (cr), "r"(cb),"r"(lum),
233 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod));
234 } 233 }
235 234
236 void Color565DitherYV12MMX1X( int *colortab, Uint32 *rgb_2_pix, 235 void
237 unsigned char *lum, unsigned char *cr, 236 Color565DitherYV12MMX1X (int *colortab, Uint32 * rgb_2_pix,
238 unsigned char *cb, unsigned char *out, 237 unsigned char *lum, unsigned char *cr,
239 int rows, int cols, int mod ) 238 unsigned char *cb, unsigned char *out,
239 int rows, int cols, int mod)
240 { 240 {
241 Uint16 *row1; 241 Uint16 *row1;
242 Uint16 *row2; 242 Uint16 *row2;
243 243
244 unsigned char* y = lum +cols*rows; /* Pointer to the end */ 244 unsigned char *y = lum + cols * rows; /* Pointer to the end */
245 int x=0; 245 int x = 0;
246 row1 = (Uint16 *)out; /* 16 bit target */ 246 row1 = (Uint16 *) out; /* 16 bit target */
247 row2 = (Uint16 *)out+cols+mod; /* start of second row */ 247 row2 = (Uint16 *) out + cols + mod; /* start of second row */
248 mod = (mod+cols+mod)*2; /* increment for row1 in byte */ 248 mod = (mod + cols + mod) * 2; /* increment for row1 in byte */
249 249
250 250
251 __asm__ __volatile__( 251 __asm__ __volatile__ ("pushl %%ebx\n" "movl %0, %%ebx\n" ".align 8\n" "1:\n" "movd (%1), %%mm0\n" // 4 Cb 0 0 0 0 u3 u2 u1 u0
252 "pushl %%ebx\n" 252 "pxor %%mm7, %%mm7\n" "movd (%%ebx), %%mm1\n" // 4 Cr 0 0 0 0 v3 v2 v1 v0
253 "movl %0, %%ebx\n" 253 "punpcklbw %%mm7, %%mm0\n" // 4 W cb 0 u3 0 u2 0 u1 0 u0
254 254 "punpcklbw %%mm7, %%mm1\n" // 4 W cr 0 v3 0 v2 0 v1 0 v0
255 ".align 8\n" 255 "psubw _MMX_0080w, %%mm0\n" "psubw _MMX_0080w, %%mm1\n" "movq %%mm0, %%mm2\n" // Cb 0 u3 0 u2 0 u1 0 u0
256 "1:\n" 256 "movq %%mm1, %%mm3\n" // Cr
257 "movd (%1), %%mm0\n" // 4 Cb 0 0 0 0 u3 u2 u1 u0 257 "pmullw _MMX_Ugrn565, %%mm2\n" // Cb2green 0 R3 0 R2 0 R1 0 R0
258 "pxor %%mm7, %%mm7\n" 258 "movq (%2), %%mm6\n" // L1 l7 L6 L5 L4 L3 L2 L1 L0
259 "movd (%%ebx), %%mm1\n" // 4 Cr 0 0 0 0 v3 v2 v1 v0 259 "pmullw _MMX_Ublu5x5, %%mm0\n" // Cb2blue
260 "punpcklbw %%mm7, %%mm0\n" // 4 W cb 0 u3 0 u2 0 u1 0 u0 260 "pand _MMX_00FFw, %%mm6\n" // L1 00 L6 00 L4 00 L2 00 L0
261 "punpcklbw %%mm7, %%mm1\n" // 4 W cr 0 v3 0 v2 0 v1 0 v0 261 "pmullw _MMX_Vgrn565, %%mm3\n" // Cr2green
262 "psubw _MMX_0080w, %%mm0\n" 262 "movq (%2), %%mm7\n" // L2
263 "psubw _MMX_0080w, %%mm1\n" 263 "pmullw _MMX_Vred5x5, %%mm1\n" // Cr2red
264 "movq %%mm0, %%mm2\n" // Cb 0 u3 0 u2 0 u1 0 u0 264 "psrlw $8, %%mm7\n" // L2 00 L7 00 L5 00 L3 00 L1
265 "movq %%mm1, %%mm3\n" // Cr 265 "pmullw _MMX_Ycoeff, %%mm6\n" // lum1
266 "pmullw _MMX_Ugrn565, %%mm2\n" // Cb2green 0 R3 0 R2 0 R1 0 R0 266 "paddw %%mm3, %%mm2\n" // Cb2green + Cr2green == green
267 "movq (%2), %%mm6\n" // L1 l7 L6 L5 L4 L3 L2 L1 L0 267 "pmullw _MMX_Ycoeff, %%mm7\n" // lum2
268 "pmullw _MMX_Ublu5x5, %%mm0\n" // Cb2blue 268 "movq %%mm6, %%mm4\n" // lum1
269 "pand _MMX_00FFw, %%mm6\n" // L1 00 L6 00 L4 00 L2 00 L0 269 "paddw %%mm0, %%mm6\n" // lum1 +blue 00 B6 00 B4 00 B2 00 B0
270 "pmullw _MMX_Vgrn565, %%mm3\n" // Cr2green 270 "movq %%mm4, %%mm5\n" // lum1
271 "movq (%2), %%mm7\n" // L2 271 "paddw %%mm1, %%mm4\n" // lum1 +red 00 R6 00 R4 00 R2 00 R0
272 "pmullw _MMX_Vred5x5, %%mm1\n" // Cr2red 272 "paddw %%mm2, %%mm5\n" // lum1 +green 00 G6 00 G4 00 G2 00 G0
273 "psrlw $8, %%mm7\n" // L2 00 L7 00 L5 00 L3 00 L1 273 "psraw $6, %%mm4\n" // R1 0 .. 64
274 "pmullw _MMX_Ycoeff, %%mm6\n" // lum1 274 "movq %%mm7, %%mm3\n" // lum2 00 L7 00 L5 00 L3 00 L1
275 "paddw %%mm3, %%mm2\n" // Cb2green + Cr2green == green 275 "psraw $6, %%mm5\n" // G1 - .. +
276 "pmullw _MMX_Ycoeff, %%mm7\n" // lum2 276 "paddw %%mm0, %%mm7\n" // Lum2 +blue 00 B7 00 B5 00 B3 00 B1
277 277 "psraw $6, %%mm6\n" // B1 0 .. 64
278 "movq %%mm6, %%mm4\n" // lum1 278 "packuswb %%mm4, %%mm4\n" // R1 R1
279 "paddw %%mm0, %%mm6\n" // lum1 +blue 00 B6 00 B4 00 B2 00 B0 279 "packuswb %%mm5, %%mm5\n" // G1 G1
280 "movq %%mm4, %%mm5\n" // lum1 280 "packuswb %%mm6, %%mm6\n" // B1 B1
281 "paddw %%mm1, %%mm4\n" // lum1 +red 00 R6 00 R4 00 R2 00 R0 281 "punpcklbw %%mm4, %%mm4\n" "punpcklbw %%mm5, %%mm5\n" "pand _MMX_red565, %%mm4\n" "psllw $3, %%mm5\n" // GREEN 1
282 "paddw %%mm2, %%mm5\n" // lum1 +green 00 G6 00 G4 00 G2 00 G0 282 "punpcklbw %%mm6, %%mm6\n" "pand _MMX_grn565, %%mm5\n" "pand _MMX_red565, %%mm6\n" "por %%mm5, %%mm4\n" //
283 "psraw $6, %%mm4\n" // R1 0 .. 64 283 "psrlw $11, %%mm6\n" // BLUE 1
284 "movq %%mm7, %%mm3\n" // lum2 00 L7 00 L5 00 L3 00 L1 284 "movq %%mm3, %%mm5\n" // lum2
285 "psraw $6, %%mm5\n" // G1 - .. + 285 "paddw %%mm1, %%mm3\n" // lum2 +red 00 R7 00 R5 00 R3 00 R1
286 "paddw %%mm0, %%mm7\n" // Lum2 +blue 00 B7 00 B5 00 B3 00 B1 286 "paddw %%mm2, %%mm5\n" // lum2 +green 00 G7 00 G5 00 G3 00 G1
287 "psraw $6, %%mm6\n" // B1 0 .. 64 287 "psraw $6, %%mm3\n" // R2
288 "packuswb %%mm4, %%mm4\n" // R1 R1 288 "por %%mm6, %%mm4\n" // MM4
289 "packuswb %%mm5, %%mm5\n" // G1 G1 289 "psraw $6, %%mm5\n" // G2
290 "packuswb %%mm6, %%mm6\n" // B1 B1 290 "movq (%2, %4), %%mm6\n" // L3 load lum2
291 "punpcklbw %%mm4, %%mm4\n" 291 "psraw $6, %%mm7\n" "packuswb %%mm3, %%mm3\n" "packuswb %%mm5, %%mm5\n" "packuswb %%mm7, %%mm7\n" "pand _MMX_00FFw, %%mm6\n" // L3
292 "punpcklbw %%mm5, %%mm5\n" 292 "punpcklbw %%mm3, %%mm3\n" "punpcklbw %%mm5, %%mm5\n" "pmullw _MMX_Ycoeff, %%mm6\n" // lum3
293 293 "punpcklbw %%mm7, %%mm7\n" "psllw $3, %%mm5\n" // GREEN 2
294 "pand _MMX_red565, %%mm4\n" 294 "pand _MMX_red565, %%mm7\n" "pand _MMX_red565, %%mm3\n" "psrlw $11, %%mm7\n" // BLUE 2
295 "psllw $3, %%mm5\n" // GREEN 1 295 "pand _MMX_grn565, %%mm5\n" "por %%mm7, %%mm3\n" "movq (%2,%4), %%mm7\n" // L4 load lum2
296 "punpcklbw %%mm6, %%mm6\n" 296 "por %%mm5, %%mm3\n" //
297 "pand _MMX_grn565, %%mm5\n" 297 "psrlw $8, %%mm7\n" // L4
298 "pand _MMX_red565, %%mm6\n" 298 "movq %%mm4, %%mm5\n" "punpcklwd %%mm3, %%mm4\n" "pmullw _MMX_Ycoeff, %%mm7\n" // lum4
299 "por %%mm5, %%mm4\n" // 299 "punpckhwd %%mm3, %%mm5\n" "movq %%mm4, (%3)\n" // write row1
300 "psrlw $11, %%mm6\n" // BLUE 1 300 "movq %%mm5, 8(%3)\n" // write row1
301 "movq %%mm3, %%mm5\n" // lum2 301 "movq %%mm6, %%mm4\n" // Lum3
302 "paddw %%mm1, %%mm3\n" // lum2 +red 00 R7 00 R5 00 R3 00 R1 302 "paddw %%mm0, %%mm6\n" // Lum3 +blue
303 "paddw %%mm2, %%mm5\n" // lum2 +green 00 G7 00 G5 00 G3 00 G1 303 "movq %%mm4, %%mm5\n" // Lum3
304 "psraw $6, %%mm3\n" // R2 304 "paddw %%mm1, %%mm4\n" // Lum3 +red
305 "por %%mm6, %%mm4\n" // MM4 305 "paddw %%mm2, %%mm5\n" // Lum3 +green
306 "psraw $6, %%mm5\n" // G2 306 "psraw $6, %%mm4\n" "movq %%mm7, %%mm3\n" // Lum4
307 "movq (%2, %4), %%mm6\n" // L3 load lum2 307 "psraw $6, %%mm5\n" "paddw %%mm0, %%mm7\n" // Lum4 +blue
308 "psraw $6, %%mm7\n" 308 "psraw $6, %%mm6\n" // Lum3 +blue
309 "packuswb %%mm3, %%mm3\n" 309 "movq %%mm3, %%mm0\n" // Lum4
310 "packuswb %%mm5, %%mm5\n" 310 "packuswb %%mm4, %%mm4\n" "paddw %%mm1, %%mm3\n" // Lum4 +red
311 "packuswb %%mm7, %%mm7\n" 311 "packuswb %%mm5, %%mm5\n" "paddw %%mm2, %%mm0\n" // Lum4 +green
312 "pand _MMX_00FFw, %%mm6\n" // L3 312 "packuswb %%mm6, %%mm6\n" "punpcklbw %%mm4, %%mm4\n" "punpcklbw %%mm5, %%mm5\n" "punpcklbw %%mm6, %%mm6\n" "psllw $3, %%mm5\n" // GREEN 3
313 "punpcklbw %%mm3, %%mm3\n" 313 "pand _MMX_red565, %%mm4\n" "psraw $6, %%mm3\n" // psr 6
314 "punpcklbw %%mm5, %%mm5\n" 314 "psraw $6, %%mm0\n" "pand _MMX_red565, %%mm6\n" // BLUE
315 "pmullw _MMX_Ycoeff, %%mm6\n" // lum3 315 "pand _MMX_grn565, %%mm5\n" "psrlw $11, %%mm6\n" // BLUE 3
316 "punpcklbw %%mm7, %%mm7\n" 316 "por %%mm5, %%mm4\n" "psraw $6, %%mm7\n" "por %%mm6, %%mm4\n" "packuswb %%mm3, %%mm3\n" "packuswb %%mm0, %%mm0\n" "packuswb %%mm7, %%mm7\n" "punpcklbw %%mm3, %%mm3\n" "punpcklbw %%mm0, %%mm0\n" "punpcklbw %%mm7, %%mm7\n" "pand _MMX_red565, %%mm3\n" "pand _MMX_red565, %%mm7\n" // BLUE
317 "psllw $3, %%mm5\n" // GREEN 2 317 "psllw $3, %%mm0\n" // GREEN 4
318 "pand _MMX_red565, %%mm7\n" 318 "psrlw $11, %%mm7\n" "pand _MMX_grn565, %%mm0\n" "por %%mm7, %%mm3\n" "por %%mm0, %%mm3\n" "movq %%mm4, %%mm5\n" "punpcklwd %%mm3, %%mm4\n" "punpckhwd %%mm3, %%mm5\n" "movq %%mm4, (%5)\n" "movq %%mm5, 8(%5)\n" "addl $8, %6\n" "addl $8, %2\n" "addl $4, %%ebx\n" "addl $4, %1\n" "cmpl %4, %6\n" "leal 16(%3), %3\n" "leal 16(%5),%5\n" // row2+16
319 "pand _MMX_red565, %%mm3\n" 319 "jl 1b\n" "addl %4, %2\n" // lum += cols
320 "psrlw $11, %%mm7\n" // BLUE 2 320 "addl %8, %3\n" // row1+= mod
321 "pand _MMX_grn565, %%mm5\n" 321 "addl %8, %5\n" // row2+= mod
322 "por %%mm7, %%mm3\n" 322 "movl $0, %6\n" // x=0
323 "movq (%2,%4), %%mm7\n" // L4 load lum2 323 "cmpl %7, %2\n"
324 "por %%mm5, %%mm3\n" // 324 "jl 1b\n"
325 "psrlw $8, %%mm7\n" // L4 325 "emms\n"
326 "movq %%mm4, %%mm5\n" 326 "popl %%ebx\n"::"m" (cr), "r" (cb), "r" (lum),
327 "punpcklwd %%mm3, %%mm4\n" 327 "r" (row1), "r" (cols), "r" (row2), "m" (x),
328 "pmullw _MMX_Ycoeff, %%mm7\n" // lum4 328 "m" (y), "m" (mod));
329 "punpckhwd %%mm3, %%mm5\n"
330
331 "movq %%mm4, (%3)\n" // write row1
332 "movq %%mm5, 8(%3)\n" // write row1
333
334 "movq %%mm6, %%mm4\n" // Lum3
335 "paddw %%mm0, %%mm6\n" // Lum3 +blue
336
337 "movq %%mm4, %%mm5\n" // Lum3
338 "paddw %%mm1, %%mm4\n" // Lum3 +red
339 "paddw %%mm2, %%mm5\n" // Lum3 +green
340 "psraw $6, %%mm4\n"
341 "movq %%mm7, %%mm3\n" // Lum4
342 "psraw $6, %%mm5\n"
343 "paddw %%mm0, %%mm7\n" // Lum4 +blue
344 "psraw $6, %%mm6\n" // Lum3 +blue
345 "movq %%mm3, %%mm0\n" // Lum4
346 "packuswb %%mm4, %%mm4\n"
347 "paddw %%mm1, %%mm3\n" // Lum4 +red
348 "packuswb %%mm5, %%mm5\n"
349 "paddw %%mm2, %%mm0\n" // Lum4 +green
350 "packuswb %%mm6, %%mm6\n"
351 "punpcklbw %%mm4, %%mm4\n"
352 "punpcklbw %%mm5, %%mm5\n"
353 "punpcklbw %%mm6, %%mm6\n"
354 "psllw $3, %%mm5\n" // GREEN 3
355 "pand _MMX_red565, %%mm4\n"
356 "psraw $6, %%mm3\n" // psr 6
357 "psraw $6, %%mm0\n"
358 "pand _MMX_red565, %%mm6\n" // BLUE
359 "pand _MMX_grn565, %%mm5\n"
360 "psrlw $11, %%mm6\n" // BLUE 3
361 "por %%mm5, %%mm4\n"
362 "psraw $6, %%mm7\n"
363 "por %%mm6, %%mm4\n"
364 "packuswb %%mm3, %%mm3\n"
365 "packuswb %%mm0, %%mm0\n"
366 "packuswb %%mm7, %%mm7\n"
367 "punpcklbw %%mm3, %%mm3\n"
368 "punpcklbw %%mm0, %%mm0\n"
369 "punpcklbw %%mm7, %%mm7\n"
370 "pand _MMX_red565, %%mm3\n"
371 "pand _MMX_red565, %%mm7\n" // BLUE
372 "psllw $3, %%mm0\n" // GREEN 4
373 "psrlw $11, %%mm7\n"
374 "pand _MMX_grn565, %%mm0\n"
375 "por %%mm7, %%mm3\n"
376 "por %%mm0, %%mm3\n"
377
378 "movq %%mm4, %%mm5\n"
379
380 "punpcklwd %%mm3, %%mm4\n"
381 "punpckhwd %%mm3, %%mm5\n"
382
383 "movq %%mm4, (%5)\n"
384 "movq %%mm5, 8(%5)\n"
385
386 "addl $8, %6\n"
387 "addl $8, %2\n"
388 "addl $4, %%ebx\n"
389 "addl $4, %1\n"
390 "cmpl %4, %6\n"
391 "leal 16(%3), %3\n"
392 "leal 16(%5),%5\n" // row2+16
393
394
395 "jl 1b\n"
396 "addl %4, %2\n" // lum += cols
397 "addl %8, %3\n" // row1+= mod
398 "addl %8, %5\n" // row2+= mod
399 "movl $0, %6\n" // x=0
400 "cmpl %7, %2\n"
401 "jl 1b\n"
402 "emms\n"
403 "popl %%ebx\n"
404 :
405 :"m" (cr), "r"(cb),"r"(lum),
406 "r"(row1),"r"(cols),"r"(row2),"m"(x),"m"(y),"m"(mod));
407 } 329 }
408 330
409 #endif /* GCC i386 inline assembly */ 331 #endif /* GCC i386 inline assembly */
410 #endif /* 0 */ 332 #endif /* 0 */
333 /* vi: set ts=4 sw=4 expandtab: */