Mercurial > ift6266
comparison data_generation/transformations/filetensor.py @ 167:1f5937e9e530
More moves - transformations into data_generation, added "deep" folder
author | Dumitru Erhan <dumitru.erhan@gmail.com> |
---|---|
date | Fri, 26 Feb 2010 14:15:38 -0500 |
parents | transformations/filetensor.py@faacc76d21c2 |
children |
comparison
equal
deleted
inserted
replaced
166:17ae5a1a4dd1 | 167:1f5937e9e530 |
---|---|
1 """ | |
2 Read and write the matrix file format described at | |
3 U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html} | |
4 | |
5 The format is for dense tensors: | |
6 | |
7 - magic number indicating type and endianness - 4bytes | |
8 - rank of tensor - int32 | |
9 - dimensions - int32, int32, int32, ... | |
10 - <data> | |
11 | |
12 The relationship between the rank and the number of stored dimensions is slightly tricky: | |
13 - for scalar: rank=0, dimensions = [1, 1, 1] | |
14 - for vector: rank=1, dimensions = [?, 1, 1] | |
15 - for matrix: rank=2, dimensions = [?, ?, 1] | |
16 | |
17 For rank >= 3, the number of dimensions matches the rank exactly. | |
18 | |
19 | |
20 @todo: add complex type support | |
21 | |
22 """ | |
23 import sys | |
24 import numpy | |
25 | |
def _prod(lst):
    """Return the product of every element of ``lst``.

    An empty sequence yields 1, the multiplicative identity.
    """
    product = 1
    for factor in lst:
        product *= factor
    return product
31 | |
# Magic number (the first int32 of a file) -> (numpy dtype name, bytes per element).
_magic_dtype = dict([
    (0x1E3D4C51, ('float32', 4)),
    #(0x1E3D4C52, ('packed matrix', 0)),  # what is a packed matrix?
    (0x1E3D4C53, ('float64', 8)),
    (0x1E3D4C54, ('int32', 4)),
    (0x1E3D4C55, ('uint8', 1)),
    (0x1E3D4C56, ('int16', 2)),
])

# Inverse table: numpy dtype name -> magic number, derived from the table above
# so that the two can never disagree.
_dtype_magic = dict(
    (dtype_name, magic)
    for magic, (dtype_name, _elsize) in _magic_dtype.items()
)
48 | |
def _read_int32(f):
    """Unpack a 4-byte integer from the current position in file f.

    The four bytes are decoded as a native-endian ``int32``.

    :param f: file-like object whose ``read(4)`` returns a byte string.
    :returns: the decoded value as a Python integer.
    :raises ValueError: if fewer than 4 bytes could be read (truncated file).
    """
    s = f.read(4)
    if len(s) != 4:
        # Fail with a message that names the real problem instead of letting
        # numpy complain about buffer sizes or array-to-scalar conversion.
        raise ValueError(
            'expected 4 bytes for an int32 but got %d (truncated file?)'
            % len(s))
    # frombuffer is the supported replacement for the deprecated binary mode
    # of numpy.fromstring.
    return numpy.frombuffer(s, dtype='int32').item()
54 | |
def _read_header(f, debug=False):
    """Parse the filetensor header found at the current position of ``f``.

    :param f: file object positioned on the magic number.  After the call it
        is positioned just past the stored dimensions.
    :param debug: when true, print every header field as it is decoded.
    :returns: data type, element size, rank, shape, size
    :raises KeyError: if the magic number is not listed in ``_magic_dtype``.
    """
    # What is the data type of this matrix?
    magic = _read_int32(f)
    magic_t, elsize = _magic_dtype[magic]
    if debug:
        print('header magic %s %s %s' % (magic, magic_t, elsize))
    # Only reachable if a 'packed matrix' entry is ever added to _magic_dtype.
    if magic_t == 'packed matrix':
        raise NotImplementedError('packed matrix not supported')

    # What is the rank of the tensor?
    ndim = _read_int32(f)
    if debug:
        print('header ndim %s' % (ndim,))

    # What are the dimensions of the tensor?  At least three are always
    # consumed from the file, but only the first ``ndim`` are meaningful.
    dim = numpy.fromfile(f, dtype='int32', count=max(ndim, 3))[:ndim]
    # Multiply as Python integers: a product of int32 scalars can silently
    # wrap around for large tensors.
    dim_size = _prod([int(d) for d in dim])
    if debug:
        print('header dim %s %s' % (dim, dim_size))

    return magic_t, elsize, ndim, dim, dim_size
79 | |
class arraylike(object):
    """Provide an array-like interface to the filetensor in f.

    The rank parameter to __init__ controls how this object interprets the underlying tensor.
    Its behaviour should be clear from the following example.
    Suppose the underlying tensor is MxNxK.

    - If rank is 0, self[i] will be a scalar and len(self) == M*N*K.

    - If rank is 1, self[i] is a vector of length K, and len(self) == M*N.

    - If rank is 3, self[i] is a 3D tensor of size MxNxK, and len(self)==1.

    - If rank is 5, self[i] is a 5D tensor of size 1x1xMxNxK, and len(self) == 1.

    Negative indices count from the end, as for built-in sequences.

    :note: Objects of this class generally require exclusive use of the underlying file handle, because
    they call seek() every time you access an element.
    """

    # File-like object
    f = None

    # numpy data type of array
    magic_t = None

    # number of bytes per scalar element
    elsize = None

    # Rank of underlying tensor
    ndim = None

    # array dimensions (aka shape) as read from the header
    dim = None

    # number of scalars in the tensor (prod of dim)
    dim_size = None

    # The file position of the first element of the tensor
    f_start = None

    # tuple of array dimensions of the block that we read
    readshape = None

    # shape of the array handed back by __getitem__: readshape, left-padded
    # with 1s when the requested rank exceeds the tensor's rank
    returnshape = None

    # number of elements we must read for each block
    readsize = None

    def __init__(self, f, rank=0, debug=False):
        """Read the header from ``f`` and prepare block-wise access.

        :param f: file object positioned at the start of a filetensor header.
        :param rank: rank of the items returned by indexing (see class doc).
        :param debug: when true, print the header fields and read parameters.
        """
        self.f = f
        (self.magic_t, self.elsize, self.ndim,
         self.dim, self.dim_size) = _read_header(f, debug)
        self.f_start = f.tell()

        if rank <= self.ndim:
            # Each item covers the trailing ``rank`` dimensions.
            trailing = self.dim[self.ndim - rank:]
            padding = ()
        else:
            # The whole tensor is a single item, padded with leading 1s.
            trailing = self.dim
            padding = (1,) * (rank - self.ndim)

        # Plain Python ints keep the offset arithmetic in __getitem__ free
        # of fixed-width (int32) overflow.
        self.readshape = tuple([int(d) for d in trailing])
        self.returnshape = padding + self.readshape
        self.readsize = _prod(self.readshape)
        if debug:
            print('READ PARAM %s %s %s'
                  % (self.readshape, self.returnshape, self.readsize))

    def __len__(self):
        """Return the number of blocks of shape ``readshape`` in the tensor."""
        leading = self.dim[:self.ndim - len(self.readshape)]
        return int(_prod([int(d) for d in leading]))

    def __getitem__(self, idx):
        """Read block number ``idx`` from the file.

        :returns: numpy array of shape ``returnshape`` and dtype ``magic_t``.
        :raises IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        length = len(self)
        pos = idx + length if idx < 0 else idx
        if pos < 0 or pos >= length:
            raise IndexError(idx)
        offset = pos * int(self.elsize) * int(self.readsize)
        self.f.seek(self.f_start + offset)
        block = numpy.fromfile(self.f,
                               dtype=self.magic_t,
                               count=self.readsize)
        return block.reshape(self.returnshape)
159 | |
160 | |
161 # | |
162 # TODO: implement item selection: | |
163 # e.g. load('some mat', subtensor=(:6, 2:5)) | |
164 # | |
165 # This function should be memory efficient by: | |
166 # - allocating an output matrix at the beginning | |
167 # - seeking through the file, reading subtensors from multiple places | |
def read(f, subtensor=None, debug=False):
    """Load all or part of file 'f' into a numpy ndarray

    @param f: file from which to read
    @type f: file-like object

    @param subtensor: None to load everything, or a slice (step None or 1)
    selecting a contiguous range along the first dimension
    @type subtensor: None or slice

    @param debug: when true, print the header fields while parsing
    @type debug: bool

    If subtensor is not None, it should be like the argument to
    numpy.ndarray.__getitem__.  The following two expressions should return
    equivalent ndarray objects, but the one on the left may be faster and more
    memory efficient if the underlying file f is big.

    read(f, subtensor) <===> read(f)[*subtensor]

    Support for subtensors is currently spotty, so check the code to see if your
    particular type of subtensor is supported.

    @raise NotImplementedError: for a slice with a step other than 1, or for
    any subtensor that is not a slice
    @raise IndexError: when a slice is applied to a rank-0 tensor

    """
    magic_t, elsize, ndim, dim, dim_size = _read_header(f, debug)
    f_start = f.tell()

    # Python ints avoid fixed-width overflow in the size/offset arithmetic.
    shape = [int(d) for d in dim]

    if subtensor is None:
        pass
    elif isinstance(subtensor, slice):
        if subtensor.step not in (None, 1):
            raise NotImplementedError('slice with step', subtensor.step)
        if not shape:
            raise IndexError('cannot slice a rank-0 tensor')
        # slice.indices resolves None, negative and out-of-range bounds the
        # same way ndarray slicing does.
        start, stop, _step = subtensor.indices(shape[0])
        shape[0] = max(0, stop - start)
        if start:
            bytes_per_row = _prod(shape[1:]) * elsize
            f.seek(f_start + start * bytes_per_row)
    else:
        raise NotImplementedError('subtensor access not written yet:', subtensor)

    return numpy.fromfile(f, dtype=magic_t, count=_prod(shape)).reshape(shape)
203 | |
def write(f, mat):
    """Write a numpy.ndarray to file.

    The header (magic number, rank, then at least three dimensions padded
    with 1s) is written as native-endian int32 values, followed by the data
    of mat as produced by ndarray.tofile.

    @param f: file into which to write
    @type f: file-like object

    @param mat: array to write to file
    @type mat: numpy ndarray or compatible

    @raise TypeError: if the dtype of mat has no entry in _dtype_magic

    """
    # Accept anything array-like, as documented, not just ndarray instances.
    mat = numpy.asarray(mat)

    # Keep the try body to the lookup so unrelated errors are not masked.
    try:
        magic = _dtype_magic[str(mat.dtype)]
    except KeyError:
        raise TypeError('Invalid ndarray dtype for filetensor format', mat.dtype)

    shape = list(mat.shape)
    rank = len(shape)
    if rank < 3:
        # The format always stores at least three dimensions.
        shape += [1] * (3 - rank)

    header = [magic, rank] + shape
    numpy.asarray(header, dtype='int32').tofile(f)
    mat.tofile(f)
232 |