121
|
1 # This library is free software; you can redistribute it and/or
|
|
2 # modify it under the terms of the GNU Lesser General Public
|
|
3 # License as published by the Free Software Foundation; either
|
|
4 # version 2.1 of the License, or (at your option) any later version.
|
|
5 #
|
|
6 # This library is distributed in the hope that it will be useful,
|
|
7 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
8 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
9 # Lesser General Public License for more details.
|
|
10 #
|
|
11 # You should have received a copy of the GNU Lesser General Public
|
|
12 # License along with this library; if not, write to the
|
|
13 # Free Software Foundation, Inc.,
|
|
14 # 59 Temple Place, Suite 330,
|
|
15 # Boston, MA 02111-1307 USA
|
|
16
|
|
17 # This file is part of urlgrabber, a high-level cross-protocol url-grabber
|
|
18 # Copyright 2002-2004 Michael D. Stenner, Ryan Tomayko
|
|
19
|
|
20 # $Id: byterange.py,v 1.9 2005/02/14 21:55:07 mstenner Exp $
|
|
21
|
|
22 import os
|
|
23 import stat
|
|
24 import urllib
|
|
25 import urllib2
|
|
26 import email.Utils
|
|
27
|
|
28 try:
|
|
29 from cStringIO import StringIO
|
|
30 except ImportError, msg:
|
|
31 from StringIO import StringIO
|
|
32
|
|
class RangeError(IOError):
    """Signal that a requested byte range cannot be satisfied.

    Derives from IOError so that callers which already handle I/O
    failures also catch range failures.
    """
|
|
36
|
|
class HTTPRangeHandler(urllib2.BaseHandler):
    """urllib2 handler that makes HTTP Range requests usable.

    Range is already an HTTP feature, so the only work needed is to
    tell urllib2 that a "206 Partial Content" response from the server
    is the expected outcome rather than an error, and to report an
    unsatisfiable range as a RangeError.

    Example:
        import urllib2
        import byterange

        range_handler = byterange.HTTPRangeHandler()
        opener = urllib2.build_opener(range_handler)

        # install it
        urllib2.install_opener(opener)

        # create Request and set Range header
        req = urllib2.Request('http://www.python.org/')
        req.add_header('Range', 'bytes=30-50')
        f = urllib2.urlopen(req)
    """

    def http_error_206(self, req, fp, code, msg, hdrs):
        """Turn a 206 Partial Content reply into a normal response object.

        Returns an addinfourl wrapping fp and hdrs, with the status code
        and message attached as .code and .msg.
        """
        response = urllib.addinfourl(fp, hdrs, req.get_full_url())
        response.code, response.msg = code, msg
        return response

    def http_error_416(self, req, fp, code, msg, hdrs):
        """Report HTTP 416 (Range Not Satisfiable) by raising RangeError."""
        raise RangeError('Requested Range Not Satisfiable')
|
|
67
|
|
68 def http_error_416(self, req, fp, code, msg, hdrs):
|
|
69 # HTTP's Range Not Satisfiable error
|
|
70 raise RangeError('Requested Range Not Satisfiable')
|
|
71
|
|
class RangeableFileObject:
    """File object wrapper to enable raw range handling.

    This was implemented primarily for handling range
    specifications for file:// urls. This object effectively makes
    a file object look like it consists only of a range of bytes in
    the stream.

    Examples:
        # expose 10 bytes, starting at byte position 20, from
        # /etc/passwd.
        >>> fo = RangeableFileObject(file('/etc/passwd', 'r'), (20,30))
        # seek seeks within the range (to position 23 in this case)
        >>> fo.seek(3)
        # tell tells where you are _within the range_ (position 3 in
        # this case)
        >>> fo.tell()
        # read EOFs if an attempt is made to read past the last
        # byte in the range. the following will return only 7 bytes.
        >>> fo.read(30)
    """

    def __init__(self, fo, rangetup):
        """Create a RangeableFileObject.

        fo       -- a file like object. only the read() method need be
                    supported but supporting an optimized seek() is
                    preferable.
        rangetup -- a (firstbyte,lastbyte) tuple specifying the range
                    to work over. lastbyte may be '' for "until the end
                    of the stream".

        The file object provided is assumed to be at byte offset 0.
        Raises RangeError if fo has no seek() and ends before firstbyte.
        """
        self.fo = fo
        normalized = range_tuple_normalize(rangetup)
        if normalized is None:
            # range_tuple_normalize() collapses a whole-file range to
            # None; treat that as an open-ended range starting at 0
            # instead of failing while unpacking it.
            normalized = (0, '')
        (self.firstbyte, self.lastbyte) = normalized
        # realpos is the absolute position in the wrapped stream
        self.realpos = 0
        self._do_seek(self.firstbyte)

    def __getattr__(self, name):
        """This effectively allows us to wrap at the instance level.

        Any attribute not found in _this_ object will be searched for
        in self.fo. This includes methods.
        """
        if hasattr(self.fo, name):
            return getattr(self.fo, name)
        raise AttributeError(name)

    def tell(self):
        """Return the position within the range.

        This is different from fo.tell in that position 0 is the
        first byte position of the range tuple. For example, if
        this object was created with a range tuple of (500,899),
        tell() will return 0 when at byte position 500 of the file.
        """
        return (self.realpos - self.firstbyte)

    def seek(self, offset, whence=0):
        """Seek within the byte range.

        Positioning is identical to that described under tell().
        whence may be 0 (relative to the start of the range) or 1
        (relative to the current position); 2 raises IOError. A target
        beyond the end of the range is clamped to the end of the range.
        Only forward movement is possible: _do_seek() asserts that the
        resulting relative offset is not negative.
        """
        assert whence in (0, 1, 2)
        if whence == 0:   # absolute seek
            realoffset = self.firstbyte + offset
        elif whence == 1:   # relative seek
            realoffset = self.realpos + offset
        elif whence == 2:   # absolute from end of file
            # XXX: are we raising the right Error here?
            raise IOError('seek from end of file not supported.')

        # do not allow seek past lastbyte in range; compare against ''
        # (open-ended) explicitly so that an end offset of 0 still limits
        if self.lastbyte != '' and (realoffset >= self.lastbyte):
            realoffset = self.lastbyte

        self._do_seek(realoffset - self.realpos)

    def read(self, size=-1):
        """Read within the range.

        This method will limit the size read based on the range.
        """
        size = self._calc_read_size(size)
        rslt = self.fo.read(size)
        self.realpos += len(rslt)
        return rslt

    def readline(self, size=-1):
        """Read lines within the range.

        This method will limit the size read based on the range.
        """
        size = self._calc_read_size(size)
        rslt = self.fo.readline(size)
        self.realpos += len(rslt)
        return rslt

    def _calc_read_size(self, size):
        """Handles calculating the amount of data to read based on
        the range.

        Returns size unchanged for an open-ended range; otherwise
        returns at most the number of bytes left before lastbyte (a
        negative size means "everything that is left").
        """
        # '' marks an open-ended range. Testing truthiness instead would
        # also treat an end offset of 0 as "no limit" and expose the
        # whole stream for an empty range.
        if self.lastbyte != '':
            remaining = self.lastbyte - self.realpos
            if size < 0 or size >= remaining:
                size = remaining
        return size

    def _do_seek(self, offset):
        """Seek based on whether wrapped object supports seek().

        offset is relative to the current position (self.realpos).
        """
        assert offset >= 0
        if not hasattr(self.fo, 'seek'):
            self._poor_mans_seek(offset)
        else:
            self.fo.seek(self.realpos + offset)
        self.realpos += offset

    def _poor_mans_seek(self, offset):
        """Seek by calling the wrapped file objects read() method.

        This is used for file like objects that do not have native
        seek support. The wrapped objects read() method is called
        to manually seek to the desired position.
        offset -- read this number of bytes from the wrapped
                  file object.
        raise RangeError if we encounter EOF before reaching the
        specified offset.
        """
        pos = 0
        bufsize = 1024
        while pos < offset:
            if (pos + bufsize) > offset:
                # final, partial chunk
                bufsize = offset - pos
            buf = self.fo.read(bufsize)
            if len(buf) != bufsize:
                raise RangeError('Requested Range Not Satisfiable')
            pos += bufsize
|
|
203
|
|
class FileRangeHandler(urllib2.FileHandler):
    """FileHandler subclass that adds Range support.

    This class handles Range headers exactly like an HTTP
    server would.
    """
    def open_local_file(self, req):
        """Open the local file named by req, honouring a Range header.

        Returns an addinfourl whose headers carry Content-Type,
        Content-Length (the length of the selected range, or of the
        whole file when no range is requested) and Last-Modified.

        Raises urllib2.URLError if the URL names a non-local host and
        RangeError if the requested range does not fit the file.
        """
        import mimetypes
        import mimetools
        # imported locally so this method does not rely on the position
        # of the module-level socket import
        import socket
        host = req.get_host()
        selector = req.get_selector()
        localfile = urllib.url2pathname(selector)
        stats = os.stat(localfile)
        size = stats[stat.ST_SIZE]
        modified = email.Utils.formatdate(stats[stat.ST_MTIME])
        mtype = mimetypes.guess_type(selector)[0]
        if host:
            host, port = urllib.splitport(host)
            if port or socket.gethostbyname(host) not in self.get_names():
                raise urllib2.URLError('file not on local host')
        fo = open(localfile, 'rb')
        try:
            brange = req.headers.get('Range', None)
            brange = range_header_to_tuple(brange)
            assert brange != ()
            if brange:
                (fb, lb) = brange
                if lb == '':
                    # open-ended range: runs to the end of the file
                    lb = size
                if fb < 0 or fb > size or lb > size:
                    raise RangeError('Requested Range Not Satisfiable')
                size = (lb - fb)
                fo = RangeableFileObject(fo, (fb, lb))
        except:
            # do not leak the descriptor when the range is rejected
            fo.close()
            raise
        headers = mimetools.Message(StringIO(
            'Content-Type: %s\nContent-Length: %d\nLast-Modified: %s\n' %
            (mtype or 'text/plain', size, modified)))
        return urllib.addinfourl(fo, headers, 'file:' + selector)
|
|
239
|
|
240
|
|
241 # FTP Range Support
|
|
242 # Unfortunately, a large amount of base FTP code had to be copied
|
|
243 # from urllib and urllib2 in order to insert the FTP REST command.
|
|
244 # Code modifications for range support have been commented as
|
|
245 # follows:
|
|
246 # -- range support modifications start/end here
|
|
247
|
|
248 from urllib import splitport, splituser, splitpasswd, splitattr, \
|
|
249 unquote, addclosehook, addinfourl
|
|
250 import ftplib
|
|
251 import socket
|
|
252 import sys
|
|
253 import mimetypes
|
|
254 import mimetools
|
|
255
|
|
256 class FTPRangeHandler(urllib2.FTPHandler):
|
|
257 def ftp_open(self, req):
|
|
258 host = req.get_host()
|
|
259 if not host:
|
|
260 raise IOError('ftp error', 'no host given')
|
|
261 host, port = splitport(host)
|
|
262 if port is None:
|
|
263 port = ftplib.FTP_PORT
|
|
264
|
|
265 # username/password handling
|
|
266 user, host = splituser(host)
|
|
267 if user:
|
|
268 user, passwd = splitpasswd(user)
|
|
269 else:
|
|
270 passwd = None
|
|
271 host = unquote(host)
|
|
272 user = unquote(user or '')
|
|
273 passwd = unquote(passwd or '')
|
|
274
|
|
275 try:
|
|
276 host = socket.gethostbyname(host)
|
|
277 except socket.error, msg:
|
|
278 raise urllib2.URLError(msg)
|
|
279 path, attrs = splitattr(req.get_selector())
|
|
280 dirs = path.split('/')
|
|
281 dirs = map(unquote, dirs)
|
|
282 dirs, file = dirs[:-1], dirs[-1]
|
|
283 if dirs and not dirs[0]:
|
|
284 dirs = dirs[1:]
|
|
285 try:
|
|
286 fw = self.connect_ftp(user, passwd, host, port, dirs)
|
|
287 type = file and 'I' or 'D'
|
|
288 for attr in attrs:
|
|
289 attr, value = splitattr(attr)
|
|
290 if attr.lower() == 'type' and \
|
|
291 value in ('a', 'A', 'i', 'I', 'd', 'D'):
|
|
292 type = value.upper()
|
|
293
|
|
294 # -- range support modifications start here
|
|
295 rest = None
|
|
296 range_tup = range_header_to_tuple(req.headers.get('Range', None))
|
|
297 assert range_tup != ()
|
|
298 if range_tup:
|
|
299 (fb, lb) = range_tup
|
|
300 if fb > 0:
|
|
301 rest = fb
|
|
302 # -- range support modifications end here
|
|
303
|
|
304 fp, retrlen = fw.retrfile(file, type, rest)
|
|
305
|
|
306 # -- range support modifications start here
|
|
307 if range_tup:
|
|
308 (fb, lb) = range_tup
|
|
309 if lb == '':
|
|
310 if retrlen is None or retrlen == 0:
|
|
311 raise RangeError('Requested Range Not Satisfiable due to unobtainable file length.')
|
|
312 lb = retrlen
|
|
313 retrlen = lb - fb
|
|
314 if retrlen < 0:
|
|
315 # beginning of range is larger than file
|
|
316 raise RangeError('Requested Range Not Satisfiable')
|
|
317 else:
|
|
318 retrlen = lb - fb
|
|
319 fp = RangeableFileObject(fp, (0, retrlen))
|
|
320 # -- range support modifications end here
|
|
321
|
|
322 headers = ""
|
|
323 mtype = mimetypes.guess_type(req.get_full_url())[0]
|
|
324 if mtype:
|
|
325 headers += "Content-Type: %s\n" % mtype
|
|
326 if retrlen is not None and retrlen >= 0:
|
|
327 headers += "Content-Length: %d\n" % retrlen
|
|
328 sf = StringIO(headers)
|
|
329 headers = mimetools.Message(sf)
|
|
330 return addinfourl(fp, headers, req.get_full_url())
|
|
331 except ftplib.all_errors, msg:
|
|
332 raise IOError('ftp error', msg), sys.exc_info()[2]
|
|
333
|
|
334 def connect_ftp(self, user, passwd, host, port, dirs):
|
|
335 fw = ftpwrapper(user, passwd, host, port, dirs)
|
|
336 return fw
|
|
337
|
|
338 class ftpwrapper(urllib.ftpwrapper):
|
|
339 # range support note:
|
|
340 # this ftpwrapper code is copied directly from
|
|
341 # urllib. The only enhancement is to add the rest
|
|
342 # argument and pass it on to ftp.ntransfercmd
|
|
343 def retrfile(self, file, type, rest=None):
|
|
344 self.endtransfer()
|
|
345 if type in ('d', 'D'):
|
|
346 cmd = 'TYPE A'
|
|
347 isdir = 1
|
|
348 else:
|
|
349 cmd = 'TYPE ' + type
|
|
350 isdir = 0
|
|
351 try:
|
|
352 self.ftp.voidcmd(cmd)
|
|
353 except ftplib.all_errors:
|
|
354 self.init()
|
|
355 self.ftp.voidcmd(cmd)
|
|
356 conn = None
|
|
357 if file and not isdir:
|
|
358 # Use nlst to see if the file exists at all
|
|
359 try:
|
|
360 self.ftp.nlst(file)
|
|
361 except ftplib.error_perm, reason:
|
|
362 raise IOError('ftp error', reason), sys.exc_info()[2]
|
|
363 # Restore the transfer mode!
|
|
364 self.ftp.voidcmd(cmd)
|
|
365 # Try to retrieve as a file
|
|
366 try:
|
|
367 cmd = 'RETR ' + file
|
|
368 conn = self.ftp.ntransfercmd(cmd, rest)
|
|
369 except ftplib.error_perm, reason:
|
|
370 if str(reason).startswith('501'):
|
|
371 # workaround for REST not supported error
|
|
372 fp, retrlen = self.retrfile(file, type)
|
|
373 fp = RangeableFileObject(fp, (rest,''))
|
|
374 return (fp, retrlen)
|
|
375 elif not str(reason).startswith('550'):
|
|
376 raise IOError('ftp error', reason), sys.exc_info()[2]
|
|
377 if not conn:
|
|
378 # Set transfer mode to ASCII!
|
|
379 self.ftp.voidcmd('TYPE A')
|
|
380 # Try a directory listing
|
|
381 if file:
|
|
382 cmd = 'LIST ' + file
|
|
383 else:
|
|
384 cmd = 'LIST'
|
|
385 conn = self.ftp.ntransfercmd(cmd)
|
|
386 self.busy = 1
|
|
387 # Pass back both a suitably decorated object and a retrieval length
|
|
388 return (addclosehook(conn[0].makefile('rb'),
|
|
389 self.endtransfer), conn[1])
|
|
390
|
|
391
|
|
392 ####################################################################
|
|
393 # Range Tuple Functions
|
|
394 # XXX: These range tuple functions might go better in a class.
|
|
395
|
|
# compiled lazily on first use by range_header_to_tuple()
_rangere = None
def range_header_to_tuple(range_header):
    """Get a (firstbyte,lastbyte) tuple from a Range header value.

    Range headers have the form "bytes=<firstbyte>-<lastbyte>". This
    function pulls the firstbyte and lastbyte values and returns
    a (firstbyte,lastbyte) tuple. The header's last byte is inclusive,
    while the returned lastbyte is exclusive (one past it), so
    "bytes=0-0" yields (0, 1). If lastbyte is not specified in
    the header value, it is returned as an empty string in the
    tuple.

    Return None if range_header is None, or if the header selects the
    entire file (e.g. "bytes=0-").
    Return () if range_header does not conform to the range spec
    pattern.
    Raise RangeError (from range_tuple_normalize) if lastbyte is
    smaller than firstbyte.
    """
    global _rangere
    if range_header is None:
        return None
    if _rangere is None:
        import re
        _rangere = re.compile(r'^bytes=(\d{1,})-(\d*)')
    match = _rangere.match(range_header)
    if match:
        tup = range_tuple_normalize(match.group(1, 2))
        # Compare against '' rather than testing truthiness: a last
        # byte of 0 is a real value and must be converted too.
        if tup and tup[1] != '':
            tup = (tup[0], tup[1] + 1)
        return tup
    return ()
|
|
424
|
|
def range_tuple_to_header(range_tup):
    """Build a Range header value from a range tuple.

    The tuple's last element is exclusive while the header's last byte
    is inclusive, so one is subtracted when a last byte is present.

    Return a string of the form "bytes=<firstbyte>-<lastbyte>", or None
    when no range is needed (range_tup is None or covers the whole
    file).
    """
    if range_tup is None:
        return None
    normalized = range_tuple_normalize(range_tup)
    if not normalized:
        return None
    first, last = normalized
    if last:
        last = last - 1
    return 'bytes=%s-%s' % (first, last)
|
|
437
|
|
def range_tuple_normalize(range_tup):
    """Normalize a (first_byte,last_byte) range tuple.

    Return a tuple whose first element is guaranteed to be an int
    and whose second element will be '' (meaning: the last byte) or
    an int. A missing, None or '' first element is taken as 0; a
    missing or None second element is taken as ''.

    Return None if range_tup is None or if the normalized tuple ==
    (0,''), as that is equivalent to retrieving the entire file.
    Raise RangeError if the last byte is smaller than the first byte.
    """
    if range_tup is None:
        return None
    # handle first byte
    fb = range_tup[0]
    if fb in (None, ''):
        fb = 0
    else:
        fb = int(fb)
    # handle last byte
    try:
        lb = range_tup[1]
    except IndexError:
        lb = ''
    else:
        if lb is None:
            lb = ''
        elif lb != '':
            lb = int(lb)
    # check if range is over the entire file
    if (fb, lb) == (0, ''):
        return None
    # check that the range is valid; an open-ended range ('') has no
    # upper bound, so only compare when lb is a number instead of
    # relying on str/int ordering
    if lb != '' and lb < fb:
        raise RangeError('Invalid byte range: %s-%s' % (fb, lb))
    return (fb, lb)
|