data_generation/transformations/filetensor.py @ 167:1f5937e9e530

More moves - transformations into data_generation, added "deep" folder
author  Dumitru Erhan <dumitru.erhan@gmail.com>
date    Fri, 26 Feb 2010 14:15:38 -0500
parents transformations/filetensor.py@faacc76d21c2

1 """
2 Read and write the matrix file format described at
3 U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html}
4
5 The format is for dense tensors:
6
7 - magic number indicating type and endianness - 4bytes
8 - rank of tensor - int32
9 - dimensions - int32, int32, int32, ...
10 - <data>
11
12 The number of dimensions and rank is slightly tricky:
13 - for scalar: rank=0, dimensions = [1, 1, 1]
14 - for vector: rank=1, dimensions = [?, 1, 1]
15 - for matrix: rank=2, dimensions = [?, ?, 1]
16
17 For rank >= 3, the number of dimensions matches the rank exactly.
18
19
20 @todo: add complex type support
21
22 """
import sys
import numpy

def _prod(lst):
    p = 1
    for l in lst:
        p *= l
    return p

_magic_dtype = {
    0x1E3D4C51: ('float32', 4),
    #0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix?
    0x1E3D4C53: ('float64', 8),
    0x1E3D4C54: ('int32', 4),
    0x1E3D4C55: ('uint8', 1),
    0x1E3D4C56: ('int16', 2),
}
_dtype_magic = {
    'float32': 0x1E3D4C51,
    #'packed matrix': 0x1E3D4C52,
    'float64': 0x1E3D4C53,
    'int32': 0x1E3D4C54,
    'uint8': 0x1E3D4C55,
    'int16': 0x1E3D4C56,
}

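# Illustrative lookups into the tables above (values taken straight from the dicts):
#
#   _magic_dtype[0x1E3D4C55]  ->  ('uint8', 1)    # numpy dtype name, bytes per element
#   _dtype_magic['float32']   ->  0x1E3D4C51
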
def _read_int32(f):
    """unpack a 4-byte integer from the current position in file f"""
    s = f.read(4)
    s_array = numpy.fromstring(s, dtype='int32')
    return s_array.item()

def _read_header(f, debug=False):
    """
    :returns: data type, element size, rank, shape, size
    """
    #what is the data type of this matrix?
    magic = _read_int32(f)
    magic_t, elsize = _magic_dtype[magic]
    if debug:
        print 'header magic', magic, magic_t, elsize
    if magic_t == 'packed matrix':
        raise NotImplementedError('packed matrix not supported')

    #what is the rank of the tensor?
    ndim = _read_int32(f)
    if debug: print 'header ndim', ndim

    #what are the dimensions of the tensor?
    dim = numpy.fromfile(f, dtype='int32', count=max(ndim, 3))[:ndim]
    dim_size = _prod(dim)
    if debug: print 'header dim', dim, dim_size

    return magic_t, elsize, ndim, dim, dim_size

class arraylike(object):
    """Provide an array-like interface to the filetensor in f.

    The rank parameter to __init__ controls how this object interprets the
    underlying tensor.  Its behaviour should be clear from the following
    examples.  Suppose the underlying tensor is MxNxK.

    - If rank is 0, self[i] will be a scalar and len(self) == M*N*K.

    - If rank is 1, self[i] is a vector of length K, and len(self) == M*N.

    - If rank is 3, self[i] is a 3D tensor of size MxNxK, and len(self) == 1.

    - If rank is 5, self[i] is a 5D tensor of size 1x1xMxNxK, and len(self) == 1.

    A short usage sketch follows the class definition below.

    :note: Objects of this class generally require exclusive use of the
        underlying file handle, because they call seek() every time you access
        an element.
    """

    f = None
    """File-like object"""

    magic_t = None
    """numpy data type of array"""

    elsize = None
    """number of bytes per scalar element"""

    ndim = None
    """Rank of underlying tensor"""

    dim = None
    """tuple of array dimensions (aka shape)"""

    dim_size = None
    """number of scalars in the tensor (prod of dim)"""

    f_start = None
    """The file position of the first element of the tensor"""

    readshape = None
    """tuple of array dimensions of the block that we read"""

    readsize = None
    """number of elements we must read for each block"""

    def __init__(self, f, rank=0, debug=False):
        self.f = f
        self.magic_t, self.elsize, self.ndim, self.dim, self.dim_size = _read_header(f, debug)
        self.f_start = f.tell()

        if rank <= self.ndim:
            self.readshape = tuple(self.dim[self.ndim - rank:])
        else:
            self.readshape = tuple(self.dim)

        if rank <= self.ndim:
            padding = tuple()
        else:
            padding = (1,) * (rank - self.ndim)

        self.returnshape = padding + self.readshape
        self.readsize = _prod(self.readshape)
        if debug: print 'READ PARAM', self.readshape, self.returnshape, self.readsize

    def __len__(self):
        return _prod(self.dim[:self.ndim - len(self.readshape)])

    def __getitem__(self, idx):
        if idx >= len(self):
            raise IndexError(idx)
        self.f.seek(self.f_start + idx * self.elsize * self.readsize)
        return numpy.fromfile(self.f,
                dtype=self.magic_t,
                count=self.readsize).reshape(self.returnshape)

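# Hedged usage sketch for arraylike; the filename and the rank chosen here are
# purely illustrative:
#
#   f = open('some_tensor.ft', 'rb')
#   images = arraylike(f, rank=2)    # view the file as a sequence of matrices
#   first = images[0]                # seeks, then reads one block into an ndarray
#   n_blocks = len(images)           # product of the leading dimensions
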
#
# TODO: implement item selection:
#  e.g. load('some mat', subtensor=(:6, 2:5))
#
# This function should be memory efficient by:
#  - allocating an output matrix at the beginning
#  - seeking through the file, reading subtensors from multiple places
def read(f, subtensor=None, debug=False):
    """Load all or part of file 'f' into a numpy ndarray.

    @param f: file from which to read
    @type f: file-like object

    If subtensor is not None, it should be like the argument to
    numpy.ndarray.__getitem__.  The following two expressions should return
    equivalent ndarray objects, but the one on the left may be faster and more
    memory efficient if the underlying file f is big.

        read(f, subtensor) <===> read(f)[*subtensor]

    Support for subtensors is currently spotty, so check the code to see if your
    particular type of subtensor is supported.
    """
    magic_t, elsize, ndim, dim, dim_size = _read_header(f, debug)
    f_start = f.tell()

    rval = None
    if subtensor is None:
        rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim)
    elif isinstance(subtensor, slice):
        if subtensor.step not in (None, 1):
            raise NotImplementedError('slice with step', subtensor.step)
        # treat a missing start/stop the way normal slicing would
        start = subtensor.start or 0
        stop = subtensor.stop
        if stop is None:
            stop = dim[0]
        if start:
            bytes_per_row = _prod(dim[1:]) * elsize
            f.seek(f_start + start * bytes_per_row)
        dim[0] = min(dim[0], stop) - start
        rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim)
    else:
        raise NotImplementedError('subtensor access not written yet:', subtensor)

    return rval

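# Hedged usage sketch for read() with a slice subtensor ('train.ft' is an
# illustrative filename, not something this module defines):
#
#   f = open('train.ft', 'rb')
#   first_rows = read(f, subtensor=slice(0, 100))  # like read(f)[0:100], but only
#                                                  # those rows are read from disk
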
def write(f, mat):
    """Write a numpy.ndarray to file.

    @param f: file into which to write
    @type f: file-like object

    @param mat: array to write to file
    @type mat: numpy ndarray or compatible
    """
    def _write_int32(f, i):
        i_array = numpy.asarray(i, dtype='int32')
        if 0: print 'writing int32', i, i_array
        i_array.tofile(f)

    try:
        _write_int32(f, _dtype_magic[str(mat.dtype)])
    except KeyError:
        raise TypeError('Invalid ndarray dtype for filetensor format', mat.dtype)

    _write_int32(f, len(mat.shape))
    shape = mat.shape
    if len(shape) < 3:
        shape = list(shape) + [1] * (3 - len(shape))
    if 0: print 'writing shape =', shape
    for sh in shape:
        _write_int32(f, sh)
    mat.tofile(f)
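
if __name__ == '__main__':
    # Hedged round-trip sketch, not part of the original module: write a small
    # float32 matrix with write() and read it back with read().  The use of a
    # TemporaryFile here is purely illustrative.
    import tempfile
    tmp = tempfile.TemporaryFile()
    mat = numpy.arange(6, dtype='float32').reshape(2, 3)
    write(tmp, mat)
    tmp.seek(0)
    assert numpy.all(read(tmp) == mat)
    tmp.close()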