"""
Read and write the matrix file format described at
http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html

The format is for dense tensors:

    magic number indicating type and endianness - 4 bytes
    rank of tensor - int32
    dimensions - int32, int32, int32, ...
    <data>

The relationship between rank and the number of stored dimensions is slightly
tricky, because the header always contains at least three dimension fields:

    for a scalar: rank=0, dimensions = [1, 1, 1]
    for a vector: rank=1, dimensions = [?, 1, 1]
    for a matrix: rank=2, dimensions = [?, ?, 1]

For rank >= 3, the number of dimensions stored matches the rank exactly.
"""

import sys
import numpy

def prod(lst):
    """Return the product of the elements of `lst`."""
    p = 1
    for l in lst:
        p *= l
    return p

# Map header magic number -> (numpy dtype name, element size in bytes).
_magic_dtype = {
    0x1E3D4C51: ('float32', 4),
    0x1E3D4C52: ('packed matrix', 0),  # what is a packed matrix?
    0x1E3D4C53: ('float64', 8),
    0x1E3D4C54: ('int32', 4),
    0x1E3D4C55: ('int8', 1),
    0x1E3D4C56: ('int16', 2),
}
# Inverse map: numpy dtype name -> header magic number.
_dtype_magic = {
    'float32': 0x1E3D4C51,
    'packed matrix': 0x1E3D4C52,
    'float64': 0x1E3D4C53,
    'int32': 0x1E3D4C54,
    'int8': 0x1E3D4C55,
    'int16': 0x1E3D4C56,
}
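
# Added sanity check (not in the original module): the two tables above are
# expected to be mutual inverses; fail loudly at import time if they diverge.
assert all(_dtype_magic[name] == magic
           for magic, (name, _size) in _magic_dtype.items())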

def _unused():
    # NOTE: dead code -- this function refers to names (f, f_data_start,
    # debug, dim_size, elsize) that are not defined anywhere here; it is kept
    # only as a leftover sketch of header/length sanity checks.
    f.seek(0, 2)             # seek to end
    f_len = f.tell()
    f.seek(f_data_start, 0)  # seek back to where we were

    if debug: print 'length:', f_len

    f_data_bytes = (f_len - f_data_start)

    if debug: print 'data bytes according to header: ', dim_size * elsize
    if debug: print 'data bytes according to file  : ', f_data_bytes

    if debug: print 'reading data...'
    sys.stdout.flush()

def _write_int32(f, i):
    """Write `i` to file `f` as a single int32 in native byte order."""
    i_array = numpy.asarray(i, dtype='int32')
    if 0: print 'writing int32', i, i_array
    i_array.tofile(f)

def _read_int32(f):
    """Read a single int32 (native byte order) from file `f`."""
    s = f.read(4)
    s_array = numpy.fromstring(s, dtype='int32')
    return s_array.item()

def read_ndarray(f, dim, dtype):
    """Read a dense tensor of shape `dim` and the given dtype from file `f`."""
    return numpy.fromfile(f, dtype=dtype, count=prod(dim)).reshape(dim)

#
# TODO: implement item selection:
#  e.g. load('some mat', subtensor=(:6, 2:5))
#
#  This function should be memory efficient by:
#  - allocating an output matrix at the beginning
#  - seeking through the file, reading subtensors from multiple places
def read(f, subtensor=None, debug=False):
    """Load all or part of file 'f' into a numpy ndarray.

    If f is a string, it will be treated as a filename and opened in binary
    read mode.

    If subtensor is not None, it should be like the argument to
    numpy.ndarray.__getitem__. The following two expressions should return
    equivalent ndarray objects, but the one on the left may be faster and more
    memory efficient if the underlying file f is big.

        read(f, subtensor) <===> read(f)[subtensor]

    Support for subtensors is currently spotty: only a slice over the leading
    dimension, with start in (None, 0) and step in (None, 1), is implemented.
    Check the code to see whether your particular subtensor is supported.
    """
    if isinstance(f, str):
        if debug: print 'f', f
        f = open(f, 'rb')

    # what is the data type of this matrix?
    #magic_s = f.read(4)
    #magic = numpy.fromstring(magic_s, dtype='int32')
    magic = _read_int32(f)
    magic_t, elsize = _magic_dtype[magic]
    if debug:
        print 'header magic', magic, magic_t, elsize
    if magic_t == 'packed matrix':
        raise NotImplementedError('packed matrix not supported')

    # what is the rank of the tensor?
    ndim = _read_int32(f)
    if debug: print 'header ndim', ndim

    # what are the dimensions of the tensor?
    # (the header stores at least three dimension fields, even for rank < 3)
    dim = numpy.fromfile(f, dtype='int32', count=max(ndim, 3))[:ndim]
    dim_size = prod(dim)
    if debug: print 'header dim', dim, dim_size

    rval = None
    if subtensor is None:
        rval = read_ndarray(f, dim, magic_t)
    elif isinstance(subtensor, slice):
        if subtensor.step not in (None, 1):
            raise NotImplementedError('slice with step', subtensor.step)
        if subtensor.start not in (None, 0):
            bytes_per_row = prod(dim[1:]) * elsize
            raise NotImplementedError('slice with start', subtensor.start)
        if subtensor.stop is not None:
            dim[0] = min(dim[0], subtensor.stop)
        rval = read_ndarray(f, dim, magic_t)
    else:
        raise NotImplementedError('subtensor access not written yet:', subtensor)

    return rval
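
def _example_read_leading_rows(path, n_rows):
    """Added usage sketch (not part of the original module).

    Demonstrates the one subtensor form that read() currently supports: a
    plain slice over the leading dimension, with no start and no step. Both
    expressions below should produce equal arrays, but the first reads only
    the first `n_rows` rows of data from the file.
    """
    partial = read(path, slice(n_rows))   # reads only the leading rows
    full = read(path)[:n_rows]            # reads everything, then slices
    assert numpy.all(partial == full)
    return partial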

def write(f, mat):
    """Write ndarray `mat` to file `f` in the format described in the module docstring.

    If f is a string, it will be treated as a filename and opened in binary
    write mode.
    """
    if isinstance(f, str):
        f = open(f, 'wb')

    _write_int32(f, _dtype_magic[str(mat.dtype)])
    _write_int32(f, len(mat.shape))
    shape = mat.shape
    if len(shape) < 3:
        # pad the shape: the header always stores at least three dimensions
        shape = list(shape) + [1] * (3 - len(shape))
    print 'writing shape =', shape
    for sh in shape:
        _write_int32(f, sh)
    mat.tofile(f)

if __name__ == '__main__':
    # a small test script: starts by reading sys.argv[1]
    rval = read(sys.argv[1], slice(400), debug=True)  # load from filename
    print 'rval', rval.shape, rval.size

    f = open('/tmp/some_mat', 'wb')
    write(f, rval)
    print ''
    f.close()
    f = open('/tmp/some_mat', 'rb')
    rval2 = read(f)  # load from file handle
    print 'rval2', rval2.shape, rval2.size

    assert rval.dtype == rval2.dtype
    assert rval.shape == rval2.shape
    assert numpy.all(rval == rval2)
    print 'ok'