annotate filetensor.py @ 35:2508c373cf29

bug fix for reading byte matrices
author bergstrj@iro.umontreal.ca
date Fri, 18 Apr 2008 01:36:56 -0400
parents bb92087cb0f6
children 2b6656b2ef52
rev   line source
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
1 """
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
2 Read and write the matrix file format described at
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
3 http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
4
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
5 The format is for dense tensors:
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
6
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
7 magic number indicating type and endianness - 4bytes
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
8 rank of tensor - int32
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
9 dimensions - int32, int32, int32, ...
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
10 <data>
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
11
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
12 The number of dimensions and rank is slightly tricky:
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
13 for scalar: rank=0, dimensions = [1, 1, 1]
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
14 for vector: rank=1, dimensions = [?, 1, 1]
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
15 for matrix: rank=2, dimensions = [?, ?, 1]
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
16
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
17 For rank >= 3, the number of dimensions matches the rank exactly.
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
18
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
19 """
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
20 import sys
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
21 import numpy
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
22
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
def prod(lst):
    """Return the product of all items in ``lst`` (1 for an empty sequence).

    NumPy integer scalars are converted to Python ints before being
    multiplied.  The dimensions read from a file header are int32 values, and
    multiplying them as fixed-width integers can wrap around for large
    tensors; Python ints have arbitrary precision.  Items of any other type
    are multiplied unchanged.
    """
    p = 1
    for factor in lst:
        if isinstance(factor, numpy.integer):
            # Leave fixed-width arithmetic before accumulating the product.
            factor = int(factor)
        p *= factor
    return p
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
28
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
# Maps the magic number found at the start of a file to a pair
# (dtype name, element size in bytes).
_magic_dtype = {
    0x1E3D4C51: ('float32', 4),
    0x1E3D4C52: ('packed matrix', 0),  # what is a packed matrix?
    0x1E3D4C53: ('float64', 8),
    0x1E3D4C54: ('int32', 4),
    0x1E3D4C55: ('uint8', 1),
    0x1E3D4C56: ('int16', 2),
}

# Inverse lookup (dtype name -> magic number).  Derived from _magic_dtype so
# the two tables cannot drift apart when an entry is added or corrected.
_dtype_magic = dict(
    (name, magic) for magic, (name, _elsize) in _magic_dtype.items())
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
45
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
def _unused():
    """Parked debugging snippet that compares file size with the header.

    The body uses the names ``f``, ``f_data_start``, ``debug``, ``dim_size``
    and ``elsize``, none of which is a parameter or local of this function,
    and none of which is defined at module level in the visible part of this
    file.  Calling it as written therefore raises NameError; it is kept only
    as a record of the size check.
    """
    f.seek(0, 2)  # seek to end
    f_len = f.tell()
    f.seek(f_data_start, 0)  # seek back to where we were

    if debug: print('length: %s' % (f_len,))

    f_data_bytes = (f_len - f_data_start)

    # The doubled space reproduces the original output, where the message
    # itself ended with a space and print added a separator after it.
    if debug: print('data bytes according to header:  %s' % (dim_size * elsize,))
    if debug: print('data bytes according to file  :  %s' % (f_data_bytes,))

    if debug: print('reading data...')
    sys.stdout.flush()
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
61
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
def _write_int32(f, i):
    """Write the integer ``i`` to the open file ``f`` as a single int32.

    The value is converted with ``numpy.asarray(i, dtype='int32')`` and
    written with ``ndarray.tofile``, so ``f`` must be a file object (or
    filename) that ``tofile`` accepts.
    """
    numpy.asarray(i, dtype='int32').tofile(f)
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
def _read_int32(f):
    """Read 4 bytes from ``f`` and return them as a Python int (dtype int32).

    ``f`` must return a bytes-like object from ``read`` (i.e. be opened in
    binary mode).

    Raises:
        ValueError: if fewer than 4 bytes could be read, e.g. at end of file.
    """
    s = f.read(4)
    if len(s) != 4:
        raise ValueError('expected 4 bytes for an int32, got %d' % len(s))
    # frombuffer replaces the deprecated binary mode of numpy.fromstring.
    return numpy.frombuffer(s, dtype='int32').item()
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
70
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
def read_ndarray(f, dim, dtype):
    """Read ``prod(dim)`` items of type ``dtype`` from ``f``, shaped as ``dim``.

    The items are read from the current position of ``f`` with
    ``numpy.fromfile`` and the resulting flat array is reshaped to ``dim``.
    """
    n_items = prod(dim)
    flat = numpy.fromfile(f, dtype=dtype, count=n_items)
    return flat.reshape(dim)
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
73
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
74 #
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
75 # TODO: implement item selection:
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
76 # e.g. load('some mat', subtensor=(:6, 2:5))
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
77 #
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
78 # This function should be memory efficient by:
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
79 # - allocating an output matrix at the beginning
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
80 # - seeking through the file, reading subtensors from multiple places
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
def read(f, subtensor=None, debug=False):
    """Load all or part of file 'f' into a numpy ndarray

    If f is a string, it will be treated as a filename, opened in binary read
    mode, and closed again before returning.  A file object passed in by the
    caller is left open.

    If subtensor is not None, it should be like the argument to
    numpy.ndarray.__getitem__.  The following two expressions should return
    equivalent ndarray objects, but the one on the left may be faster and more
    memory efficient if the underlying file f is big.

        read(f, subtensor) <===> read(f)[*subtensor]

    Support for subtensors is currently spotty.  What is implemented:

    - ``None``: the whole tensor is read.
    - a ``slice`` with start in (None, 0) and step in (None, 1): the leading
      rows selected by ``stop`` are read.  ``stop`` may be None, negative, or
      larger than the first dimension, with the usual slice meaning.

    If debug is true, the header fields are printed as they are read.

    Raises:
        KeyError: the magic number is not a key of _magic_dtype.
        ValueError: the file ends inside the header.
        NotImplementedError: packed matrices, slices with a start or a step,
            and any other kind of subtensor.
    """
    opened_here = isinstance(f, str)
    if opened_here:
        if debug: print('f %s' % (f,))
        # Binary mode: the file holds raw numeric data, not text.
        f = open(f, 'rb')

    try:
        # what is the data type of this matrix?
        magic = _read_int32(f)
        magic_t, elsize = _magic_dtype[magic]
        if debug:
            print('header magic %s %s %s' % (magic, magic_t, elsize))
        if magic_t == 'packed matrix':
            raise NotImplementedError('packed matrix not supported')

        # what is the rank of the tensor?
        ndim = _read_int32(f)
        if debug: print('header ndim %s' % (ndim,))

        # what are the dimensions of the tensor?
        # At least three dimension fields are always stored; only the first
        # ndim of them are kept.
        dim = numpy.fromfile(f, dtype='int32', count=max(ndim, 3))[:ndim]
        if len(dim) != ndim:
            raise ValueError(
                'bad header: rank %s but %s dimensions read' % (ndim, len(dim)))
        dim_size = prod(dim)
        if debug: print('header dim %s %s' % (dim, dim_size))

        if subtensor is None:
            rval = read_ndarray(f, dim, magic_t)
        elif isinstance(subtensor, slice):
            if subtensor.step not in (None, 1):
                raise NotImplementedError('slice with step', subtensor.step)
            if subtensor.start not in (None, 0):
                raise NotImplementedError('slice with start', subtensor.start)
            # slice.indices resolves stop=None and negative stops, and clamps
            # the result to the range [0, dim[0]].
            dim[0] = subtensor.indices(int(dim[0]))[1]
            rval = read_ndarray(f, dim, magic_t)
        else:
            raise NotImplementedError('subtensor access not written yet:', subtensor)
    finally:
        # Only close what this function opened itself.
        if opened_here:
            f.close()

    return rval
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
136
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
def write(f, mat, debug=False):
    """Write the ndarray ``mat`` to ``f``: header first, then the data.

    If f is a string, it is treated as a filename, opened in binary write
    mode, and closed before returning so the data is flushed to disk.  A file
    object passed in by the caller is left open.

    The header consists of the magic number for ``mat.dtype``, the rank
    ``len(mat.shape)``, and the dimensions.  When the rank is below 3 the
    dimensions are padded with 1s so that three values are always written.

    If debug is true, the shape written to the header is printed.

    Raises:
        KeyError: ``str(mat.dtype)`` is not a key of _dtype_magic.
    """
    opened_here = isinstance(f, str)
    if opened_here:
        # Binary mode: the file holds raw numeric data, not text.
        f = open(f, 'wb')

    try:
        _write_int32(f, _dtype_magic[str(mat.dtype)])
        _write_int32(f, len(mat.shape))
        shape = mat.shape
        if len(shape) < 3:
            shape = list(shape) + [1] * (3 - len(shape))
        if debug: print('writing shape = %s' % (shape,))
        for sh in shape:
            _write_int32(f, sh)
        mat.tofile(f)
    finally:
        # Only close what this function opened itself.
        if opened_here:
            f.close()
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
150
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
if __name__ == '__main__':
    # a small test script, starts by reading sys.argv[1]
    rval = read(sys.argv[1], None, debug=True)  # load from filename
    print('rval %s %s' % (rval.shape, rval.size))

    # Round-trip check through /tmp/some_mat.  It is switched off with
    # ``if 0``; change the condition to run it.
    if 0:
        f = open('/tmp/some_mat', 'wb')
        try:
            write(f, rval)
        finally:
            f.close()
        print('')
        f = open('/tmp/some_mat', 'rb')
        try:
            rval2 = read(f)  # load from file handle
        finally:
            f.close()
        print('rval2 %s %s' % (rval2.shape, rval2.size))

        assert rval.dtype == rval2.dtype
        assert rval.shape == rval2.shape
        assert numpy.all(rval == rval2)
        print('ok')
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
169