Mercurial > pylearn
annotate pylearn/io/filetensor.py @ 1412:e1b5092b4a53
allow to read gzip filetensor when there is more then 1 tensor in the file.
author | Frederic Bastien <nouiz@nouiz.org> |
---|---|
date | Wed, 02 Feb 2011 14:33:01 -0500 |
parents | e06c0ff46d2a |
children | a6e634b83d88 |
rev | line source |
---|---|
33 | 1 """ |
2 Read and write the matrix file format described at | |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
3 U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html} |
33 | 4 |
5 The format is for dense tensors: | |
6 | |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
7 - magic number indicating type and endianness - 4bytes |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
8 - rank of tensor - int32 |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
9 - dimensions - int32, int32, int32, ... |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
10 - <data> |
33 | 11 |
12 The number of dimensions and rank is slightly tricky: | |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
13 - for scalar: rank=0, dimensions = [1, 1, 1] |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
14 - for vector: rank=1, dimensions = [?, 1, 1] |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
15 - for matrix: rank=2, dimensions = [?, ?, 1] |
33 | 16 |
17 For rank >= 3, the number of dimensions matches the rank exactly. | |
18 | |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
19 |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
20 @todo: add complex type support |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
21 |
33 | 22 """ |
1401
e06c0ff46d2a
allow pylearn.io.filetensor to accept gzip file handle.
Frederic Bastien <nouiz@nouiz.org>
parents:
1394
diff
changeset
|
23 import gzip |
e06c0ff46d2a
allow pylearn.io.filetensor to accept gzip file handle.
Frederic Bastien <nouiz@nouiz.org>
parents:
1394
diff
changeset
|
24 |
33 | 25 import numpy |
26 | |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
def _prod(lst):
    """Return the product of the items of `lst` as a Python integer.

    Every item is converted with int() before it is multiplied in.  The
    dimensions read from a filetensor header are numpy int32 values, and a
    product computed in that fixed-width type can wrap around for large
    tensors; Python integers cannot.

    :param lst: iterable of integer-like values (e.g. a shape)
    :returns: the product of the values, or 1 if `lst` is empty
    """
    p = 1
    for item in lst:
        p *= int(item)
    return p
32 | |
# Map from the magic number found at the start of a filetensor to the pair
# (numpy dtype name, size in bytes of one element).
_magic_dtype = {
        0x1E3D4C51: ('float32', 4),
        #0x1E3D4C52: ('packed matrix', 0), #what is a packed matrix?
        0x1E3D4C53: ('float64', 8),
        0x1E3D4C54: ('int32', 4),
        0x1E3D4C55: ('uint8', 1),
        0x1E3D4C56: ('int16', 2),
        }

# Reverse map (numpy dtype name -> magic number).  It is derived from
# _magic_dtype so that the two tables cannot get out of sync.
_dtype_magic = dict((_name, _magic)
                    for _magic, (_name, _elsize) in _magic_dtype.items())
49 | |
693
ee7026de9681
added filetensor.arraylike
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
537
diff
changeset
|
def _read_int32(f):
    """Unpack a 4-byte integer from the current position in file f.

    The bytes are interpreted as a numpy 'int32' (native byte order).

    :param f: object with a read(n) method returning a byte string
    :returns: the value as a Python integer
    :raises ValueError: if fewer than 4 bytes could be read
    """
    s = f.read(4)
    if len(s) != 4:
        # Fail with an explicit message rather than the obscure numpy error
        # that an empty or partial buffer would trigger further down.
        raise ValueError('expected 4 bytes for an int32 but read %d'
                         ' (truncated filetensor?)' % len(s))
    # numpy.frombuffer replaces the deprecated binary mode of
    # numpy.fromstring; item() turns the 1-element array into a scalar.
    return numpy.frombuffer(s, dtype='int32').item()
ee7026de9681
added filetensor.arraylike
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
537
diff
changeset
|
55 |
1401
e06c0ff46d2a
allow pylearn.io.filetensor to accept gzip file handle.
Frederic Bastien <nouiz@nouiz.org>
parents:
1394
diff
changeset
|
def _read_header(f, debug=False, fromgzip=None):
    """
    Read a filetensor header starting at the current position of `f`.

    On return `f` is positioned just after the header, i.e. on the first
    byte of the tensor data.

    :param f: an open file handle.
    :type f: a file or gzip.GzipFile object

    :param debug: if True, print the header fields as they are read.
    :type debug: bool

    :param fromgzip: if True, the dimensions are read with f.read()
        instead of numpy.fromfile().  If None, this is determined by
        checking whether `f` is a gzip.GzipFile.
    :type fromgzip: bool or None

    :returns: data type, element size, rank, shape, size
        (the shape is a writable int32 ndarray of length `rank`; the size
        is the number of elements, as a Python integer)

    :raises KeyError: if the magic number is not in _magic_dtype
    :raises ValueError: if the header is truncated or the rank is negative
    """
    if fromgzip is None:
        fromgzip = isinstance(f, gzip.GzipFile)

    #what is the data type of this matrix?
    magic = _read_int32(f)
    magic_t, elsize = _magic_dtype[magic]
    if debug:
        print('header magic %s %s %s' % (magic, magic_t, elsize))
    # Only reachable if the 'packed matrix' entry of _magic_dtype is enabled.
    if magic_t == 'packed matrix':
        raise NotImplementedError('packed matrix not supported')

    #what is the rank of the tensor?
    ndim = _read_int32(f)
    if debug:
        print('header ndim %s' % (ndim,))
    if ndim < 0:
        raise ValueError('invalid filetensor rank: %d' % ndim)

    #what are the dimensions of the tensor?
    # The file always stores at least 3 dimensions, even when rank < 3.
    n_stored = max(ndim, 3)
    if fromgzip:
        d = f.read(n_stored * 4)
        stored = numpy.frombuffer(d, dtype='int32')
    else:
        stored = numpy.fromfile(f, dtype='int32', count=n_stored)
    if len(stored) != n_stored:
        raise ValueError('truncated filetensor header: expected %d'
                         ' dimensions but read %d' % (n_stored, len(stored)))
    # copy() so that the caller gets a writable array that owns its data
    # (frombuffer returns a read-only view of the byte string).
    dim = stored[:ndim].copy()
    # Multiply Python integers so that the element count cannot overflow
    # the int32 type of `dim`.
    dim_size = _prod(int(d) for d in dim)
    if debug:
        print('header dim %s %s' % (dim, dim_size))

    return magic_t, elsize, ndim, dim, dim_size
ee7026de9681
added filetensor.arraylike
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
537
diff
changeset
|
93 |
ee7026de9681
added filetensor.arraylike
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
537
diff
changeset
|
class arraylike(object):
    """Provide an array-like interface to the filetensor in f.

    The rank parameter to __init__ controls how this object interprets the underlying tensor.
    Its behaviour should be clear from the following example.
    Suppose the underlying tensor is MxNxK.

    - If rank is 0, self[i] will be a scalar and len(self) == M*N*K.

    - If rank is 1, self[i] is a vector of length K, and len(self) == M*N.

    - If rank is 3, self[i] is a 3D tensor of size MxNxK, and len(self)==1.

    - If rank is 5, self[i] is a 5D tensor of size 1x1xMxNxK, and len(self) == 1.

    Negative indices count from the end, as for sequences; an index outside
    [-len(self), len(self)) raises IndexError.

    :note: Objects of this class generally require exclusive use of the underlying file handle, because
    they call seek() every time you access an element.

    :note: Elements are read with numpy.fromfile, so f must be a file object
    that numpy.fromfile accepts.
    """

    f = None
    """File-like object"""

    magic_t = None
    """numpy data type of array"""

    elsize = None
    """number of bytes per scalar element"""

    ndim = None
    """Rank of underlying tensor"""

    dim = None
    """array dimensions (aka shape), as returned by _read_header"""

    dim_size = None
    """number of scalars in the tensor (prod of dim)"""

    f_start = None
    """The file position of the first element of the tensor"""

    readshape = None
    """tuple of array dimensions of the block that we read"""

    returnshape = None
    """tuple of array dimensions of the block that we return (readshape, left-padded with 1s)"""

    readsize = None
    """number of elements we must read for each block"""

    def __init__(self, f, rank=0, debug=False):
        """
        :param f: open file positioned at the start of a filetensor header
        :param rank: rank of the blocks returned by self[i]
        :param debug: if True, print the parameters derived from the header
        """
        self.f = f
        self.magic_t, self.elsize, self.ndim, self.dim, self.dim_size = _read_header(f, debug)
        self.f_start = f.tell()

        # The shapes are stored as Python integers so that the offset
        # arithmetic in __getitem__ cannot overflow a fixed-width numpy type.
        if rank <= self.ndim:
            self.readshape = tuple(int(d) for d in self.dim[self.ndim - rank:])
            padding = tuple()
        else:
            self.readshape = tuple(int(d) for d in self.dim)
            padding = (1,) * (rank - self.ndim)

        self.returnshape = padding + self.readshape
        self.readsize = _prod(self.readshape)
        if debug:
            print('READ PARAM %s %s %s'
                  % (self.readshape, self.returnshape, self.readsize))

    def __len__(self):
        # Product of the leading dimensions that are not part of a block.
        n = 1
        for d in self.dim[:self.ndim - len(self.readshape)]:
            n *= int(d)
        return n

    def __getitem__(self, idx):
        n = len(self)
        pos = idx
        if pos < 0:
            # A negative index would otherwise seek before the start of the
            # data and silently return bytes that are not tensor elements.
            pos += n
        if not 0 <= pos < n:
            raise IndexError(idx)
        self.f.seek(self.f_start + int(pos) * int(self.elsize) * int(self.readsize))
        return numpy.fromfile(self.f,
                dtype=self.magic_t,
                count=self.readsize).reshape(self.returnshape)
ee7026de9681
added filetensor.arraylike
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
537
diff
changeset
|
173 |
ee7026de9681
added filetensor.arraylike
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
537
diff
changeset
|
174 |
33 | 175 # |
176 # TODO: implement item selection: | |
177 # e.g. load('some mat', subtensor=(:6, 2:5)) | |
178 # | |
179 # This function should be memory efficient by: | |
180 # - allocating an output matrix at the beginning | |
181 # - seeking through the file, reading subtensors from multiple places | |
def read(f, subtensor=None, debug=False):
    """Load all or part of file 'f' into a numpy ndarray

    @param f: file from which to read
    @type f: file-like object. Can be a gzip open file.

    @param subtensor: None to read the whole tensor, or a slice (with step
    None or 1) selecting a range along the first dimension.
    @type subtensor: None or slice

    @param debug: if True, print the header fields as they are read
    @type debug: bool

    @return: the tensor, or the selected part of it
    @rtype: numpy ndarray

    @raise NotImplementedError: for a subtensor that is not supported: any
    subtensor on a gzip file, a slice with a step, or a non-slice subtensor.
    @raise ValueError: if the file holds less data than the header announces.

    If subtensor is not None, it should be like the argument to
    numpy.ndarray.__getitem__.  The following two expressions should return
    equivalent ndarray objects, but the one on the left may be faster and more
    memory efficient if the underlying file f is big.

        read(f, subtensor) <===> read(f)[*subtensor]

    Support for subtensors is currently spotty, so check the code to see if your
    particular type of subtensor is supported.

    """
    magic_t, elsize, ndim, dim, dim_size = _read_header(f, debug)
    f_start = f.tell()

    # Work on Python integers: sizes and offsets computed from the int32
    # header values could otherwise overflow for big tensors.
    shape = [int(d) for d in dim]

    if isinstance(f, gzip.GzipFile):
        # An explicit raise rather than an assert, which is stripped by -O.
        if subtensor is not None:
            raise NotImplementedError(
                    "Not implemented the subtensor case for gzip file")
        # Read exactly this tensor's bytes: the file can hold more tensors.
        nbytes = _prod(shape) * elsize
        d = f.read(nbytes)
        if len(d) != nbytes:
            raise ValueError('truncated filetensor: expected %d bytes of data'
                             ' but read %d' % (nbytes, len(d)))
        # copy() gives a writable array that does not keep `d` alive.
        rval = numpy.frombuffer(d, dtype=magic_t).reshape(shape).copy()
        del d
        return rval

    if subtensor is None:
        pass
    elif isinstance(subtensor, slice):
        if subtensor.step not in (None, 1):
            raise NotImplementedError('slice with step', subtensor.step)
        # slice.indices() resolves None and negative bounds and clips them
        # to the number of rows, like numpy indexing does.
        start, stop, _step = subtensor.indices(shape[0])
        if start:
            bytes_per_row = _prod(shape[1:]) * elsize
            f.seek(f_start + start * bytes_per_row)
        shape[0] = max(stop - start, 0)
    else:
        raise NotImplementedError('subtensor access not written yet:', subtensor)

    return numpy.fromfile(f, dtype=magic_t, count=_prod(shape)).reshape(shape)
222 | |
def write(f, mat):
    """Write a numpy.ndarray to file.

    The header is the magic number of the dtype, the rank, and then the
    dimensions, padded with 1s to at least three values; all are written as
    int32.  The array data follows.

    @param f: file into which to write
    @type f: file-like object (it is passed to numpy's ndarray.tofile)

    @param mat: array to write to file
    @type mat: numpy ndarray or compatible

    @raise TypeError: if the dtype of mat has no filetensor magic number

    """
    # Accept anything array-like, as documented, not only ndarray instances.
    mat = numpy.asarray(mat)
    try:
        magic = _dtype_magic[str(mat.dtype)]
    except KeyError:
        raise TypeError('Invalid ndarray dtype for filetensor format', mat.dtype)

    shape = list(mat.shape)
    # [1] * n is empty for n <= 0, so only ranks below 3 get padded.
    padded_shape = shape + [1] * (3 - len(shape))
    header = [magic, len(shape)] + padded_shape
    numpy.asarray(header, dtype='int32').tofile(f)
    mat.tofile(f)
251 |