comparison src/video/SDL_blit_copy.c @ 2247:93994f65c74c

Removed Hermes since it's LGPL and not compatible with a commercial license. Prepping for using MMX and SSE intrinsics instead of inline assembly... except for memcpy equivalents, which only get faster if they can exploit the parallelism of loading into multiple SIMD registers. :)
author Sam Lantinga <slouken@libsdl.org>
date Wed, 15 Aug 2007 08:21:10 +0000
parents
children 5cd2a2293cf0
comparison
equal deleted inserted replaced
2246:75daa0792bd1 2247:93994f65c74c
1 /*
2 SDL - Simple DirectMedia Layer
3 Copyright (C) 1997-2006 Sam Lantinga
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with this library; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
18
19 Sam Lantinga
20 slouken@libsdl.org
21 */
22 #include "SDL_config.h"
23
24 #include "SDL_video.h"
25 #include "SDL_blit.h"
26
27 /* The MMX/SSE intrinsics don't give access to specific registers for
28 the most memory parallelism, so we'll use GCC inline assembly here...
29 */
30 #ifndef __GNUC__
31 #undef __MMX__
32 #undef __SSE__
33 #endif
34
35 #ifdef __MMX__
36 static __inline__ void
37 SDL_memcpyMMX(Uint8 *dst, const Uint8 *src, int len)
38 {
39 int i;
40
41 for (i = len / 64; i--;) {
42 __asm__ __volatile__ (
43 "prefetchnta (%0)\n"
44 "movq (%0), %%mm0\n"
45 "movq 8(%0), %%mm1\n"
46 "movq 16(%0), %%mm2\n"
47 "movq 24(%0), %%mm3\n"
48 "movq 32(%0), %%mm4\n"
49 "movq 40(%0), %%mm5\n"
50 "movq 48(%0), %%mm6\n"
51 "movq 56(%0), %%mm7\n"
52 "movntq %%mm0, (%1)\n"
53 "movntq %%mm1, 8(%1)\n"
54 "movntq %%mm2, 16(%1)\n"
55 "movntq %%mm3, 24(%1)\n"
56 "movntq %%mm4, 32(%1)\n"
57 "movntq %%mm5, 40(%1)\n"
58 "movntq %%mm6, 48(%1)\n"
59 "movntq %%mm7, 56(%1)\n"
60 :: "r" (src), "r" (dst) : "memory");
61 src += 64;
62 dst += 64;
63 }
64 if (len & 63)
65 SDL_memcpy(dst, src, len & 63);
66 }
67 #endif /* __MMX__ */
68
69 #ifdef __SSE__
70 static __inline__ void
71 SDL_memcpySSE(Uint8 *dst, const Uint8 *src, int len)
72 {
73 int i;
74
75 for (i = len / 64; i--;) {
76 __asm__ __volatile__ (
77 "prefetchnta (%0)\n"
78 "movaps (%0), %%xmm0\n"
79 "movaps 16(%0), %%xmm1\n"
80 "movaps 32(%0), %%xmm2\n"
81 "movaps 48(%0), %%xmm3\n"
82 "movntps %%xmm0, (%1)\n"
83 "movntps %%xmm1, 16(%1)\n"
84 "movntps %%xmm2, 32(%1)\n"
85 "movntps %%xmm3, 48(%1)\n"
86 :: "r" (src), "r" (dst) : "memory");
87 src += 64;
88 dst += 64;
89 }
90 if (len & 63)
91 SDL_memcpy(dst, src, len & 63);
92 }
93 #endif /* __SSE__ */
94
95 void
96 SDL_BlitCopy(SDL_BlitInfo * info)
97 {
98 Uint8 *src, *dst;
99 int w, h;
100 int srcskip, dstskip;
101
102 w = info->d_width * info->dst->BytesPerPixel;
103 h = info->d_height;
104 src = info->s_pixels;
105 dst = info->d_pixels;
106 srcskip = w + info->s_skip;
107 dstskip = w + info->d_skip;
108
109 #ifdef __SSE__
110 if (SDL_HasSSE() && !((uintptr_t)src & 15) && !((uintptr_t)dst & 15)) {
111 while (h--) {
112 SDL_memcpySSE(dst, src, w);
113 src += srcskip;
114 dst += dstskip;
115 }
116 return;
117 }
118 #endif
119
120 #ifdef __MMX__
121 if (SDL_HasMMX() && !((uintptr_t)src & 7) && !((uintptr_t)dst & 7)) {
122 while (h--) {
123 SDL_memcpyMMX(dst, src, w);
124 src += srcskip;
125 dst += dstskip;
126 }
127 __asm__ __volatile__(" emms\n"::);
128 return;
129 }
130 #endif
131
132 while (h--) {
133 SDL_memcpy(dst, src, w);
134 src += srcskip;
135 dst += dstskip;
136 }
137 }
138
139 void
140 SDL_BlitCopyOverlap(SDL_BlitInfo * info)
141 {
142 Uint8 *src, *dst;
143 int w, h;
144 int skip;
145
146 w = info->d_width * info->dst->BytesPerPixel;
147 h = info->d_height;
148 src = info->s_pixels;
149 dst = info->d_pixels;
150 skip = w + info->s_skip;
151 if ((dst < src) || (dst >= (src + h*skip))) {
152 SDL_BlitCopy(info);
153 } else {
154 src += ((h - 1) * skip);
155 dst += ((h - 1) * skip);
156 while (h--) {
157 SDL_revcpy(dst, src, w);
158 src -= skip;
159 dst -= skip;
160 }
161 }
162 }
163
164 /* vi: set ts=4 sw=4 expandtab: */