Mercurial > pylearn
annotate filetensor.py @ 169:051e07807554
added test for CachedDataSet(ds,cache_all_upon_construction=True)
author | Frederic Bastien <bastienf@iro.umontreal.ca> |
---|---|
date | Tue, 13 May 2008 13:05:45 -0400 |
parents | 2b6656b2ef52 |
children | 82ba488b2c24 |
rev | line source |
---|---|
33 | 1 """ |
2 Read and write the matrix file format described at | |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
3 U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html} |
33 | 4 |
5 The format is for dense tensors: | |
6 | |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
7 - magic number indicating type and endianness - 4bytes |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
8 - rank of tensor - int32 |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
9 - dimensions - int32, int32, int32, ... |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
10 - <data> |
33 | 11 |
12 The number of dimensions and rank is slightly tricky: | |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
13 - for scalar: rank=0, dimensions = [1, 1, 1] |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
14 - for vector: rank=1, dimensions = [?, 1, 1] |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
15 - for matrix: rank=2, dimensions = [?, ?, 1] |
33 | 16 |
17 For rank >= 3, the number of dimensions matches the rank exactly. | |
18 | |
19 """ | |
20 import sys | |
21 import numpy | |
22 | |
def prod(lst):
    """Return the product of every item in 'lst'.

    The running product starts at 1, so an empty 'lst' yields 1.
    """
    product = 1
    for factor in lst:
        product *= factor
    return product
28 | |
# Header magic number -> (dtype name, element size in bytes).
_magic_dtype = {
    0x1E3D4C51: ('float32', 4),
    0x1E3D4C52: ('packed matrix', 0),  # what is a packed matrix?
    0x1E3D4C53: ('float64', 8),
    0x1E3D4C54: ('int32', 4),
    0x1E3D4C55: ('uint8', 1),
    0x1E3D4C56: ('int16', 2),
}
# Reverse lookup, dtype name -> header magic number.  Derived from the table
# above so the two mappings cannot drift apart.
_dtype_magic = dict(
    (_name, _magic) for _magic, (_name, _elsize) in _magic_dtype.items())
45 | |
def _unused():
    # NOTE(review): dead scratch code.  f, f_data_start, debug, dim_size and
    # elsize are not defined in this function, nor at module level in the
    # visible source, so calling it would raise NameError.  It looks like a
    # leftover sanity check comparing the number of data bytes in the file
    # with the number implied by the header -- confirm before reviving it.
    f.seek(0,2) #seek to end
    f_len = f.tell()
    f.seek(f_data_start,0) #seek back to where we were

    if debug: print 'length:', f_len


    # bytes between the saved data offset and the end of the file
    f_data_bytes = (f_len - f_data_start)

    if debug: print 'data bytes according to header: ', dim_size * elsize
    if debug: print 'data bytes according to file : ', f_data_bytes

    if debug: print 'reading data...'
    sys.stdout.flush()
61 | |
def _write_int32(f, i):
    """Write 'i' to 'f' as a single 4-byte int32 in native byte order.

    @param f: binary file-like object providing a write() method.
    @param i: integer value to store; it is converted with dtype='int32'.
    """
    # Going through f.write() instead of ndarray.tofile() also supports
    # in-memory buffers that have no OS-level file descriptor.
    f.write(numpy.asarray(i, dtype='int32').tobytes())
def _read_int32(f):
    """Read one 4-byte int32 (native byte order) from 'f'.

    @param f: binary file-like object providing a read() method.
    @return: the value as a Python integer.
    @raise ValueError: if fewer than 4 bytes could be read from 'f'.
    """
    s = f.read(4)
    if len(s) != 4:
        raise ValueError('expected 4 bytes for an int32, got %i' % len(s))
    # frombuffer replaces the deprecated binary mode of numpy.fromstring.
    return numpy.frombuffer(s, dtype='int32').item()
70 | |
def read_ndarray(f, dim, dtype):
    """Read the next prod(dim) elements of 'f' and return them shaped as 'dim'.

    @param f: open file, positioned at the first element to read; it is
        handed to numpy.fromfile.
    @param dim: sequence of dimension sizes for the result.
    @param dtype: numpy dtype (or dtype name) of the stored elements.
    @return: ndarray with shape tuple(dim).
    @raise ValueError: from reshape, if fewer elements were read than 'dim'
        requires.
    """
    # read() passes dimensions parsed with dtype='int32'; multiplying numpy
    # int32 scalars can silently wrap for large tensors, so do the arithmetic
    # on Python integers instead.
    shape = tuple(int(d) for d in dim)
    count = 1
    for extent in shape:
        count *= extent
    return numpy.fromfile(f, dtype=dtype, count=count).reshape(shape)
73 | |
74 # | |
75 # TODO: implement item selection: | |
76 # e.g. read('some mat', subtensor=(slice(None, 6), slice(2, 5))) | |
77 # | |
78 # This function should be memory efficient by: | |
79 # - allocating an output matrix at the beginning | |
80 # - seeking through the file, reading subtensors from multiple places | |
def read(f, subtensor=None, debug=False):
    """Load all or part of file 'f' into a numpy ndarray

    If f is a string, it is treated as a filename: the file is opened in
    binary read mode and closed again before returning.  Otherwise f is used
    as an already-open file object and is left open.

    If subtensor is not None, it should be like the argument to
    numpy.ndarray.__getitem__.  The following two expressions should return
    equivalent ndarray objects, but the one on the left may be faster and more
    memory efficient if the underlying file f is big.

        read(f, subtensor) <===> read(f)[subtensor]

    Support for subtensors is currently spotty: only a single slice over the
    first dimension, with start in (None, 0) and step in (None, 1), is
    implemented.

    @param f: filename or open binary file object.
    @param subtensor: None, or a slice selecting leading rows (see above).
    @param debug: if True, print the header fields as they are parsed.
    @return: the tensor (or its leading rows) as an ndarray.
    @raise KeyError: if the header magic number is not in _magic_dtype.
    @raise NotImplementedError: for packed matrices and for any subtensor
        form other than the one described above.
    """
    opened_here = isinstance(f, str)
    if opened_here:
        if debug: print('f %s' % f)
        # the format is binary; text mode could corrupt it on some platforms
        f = open(f, 'rb')

    try:
        #what is the data type of this matrix?
        magic = _read_int32(f)
        magic_t, elsize = _magic_dtype[magic]
        if debug:
            print('header magic %s %s %s' % (magic, magic_t, elsize))
        if magic_t == 'packed matrix':
            raise NotImplementedError('packed matrix not supported')

        #what is the rank of the tensor?
        ndim = _read_int32(f)
        if debug: print('header ndim %s' % ndim)

        #what are the dimensions of the tensor?
        # The header always stores at least 3 dimension fields; only the
        # first ndim of them are meaningful.
        header_dims = numpy.fromfile(f, dtype='int32', count=max(ndim, 3))
        # Python ints, so that size arithmetic cannot wrap around in int32.
        dim = [int(d) for d in header_dims[:ndim]]
        if debug: print('header dim %s %s' % (dim, prod(dim)))

        if subtensor is None:
            return read_ndarray(f, dim, magic_t)

        if isinstance(subtensor, slice):
            if subtensor.step not in (None, 1):
                raise NotImplementedError('slice with step', subtensor.step)
            if subtensor.start not in (None, 0):
                raise NotImplementedError('slice with start', subtensor.start)
            # slice.indices resolves a None or negative stop and clamps it to
            # the number of rows, exactly as ndarray slicing would.
            dim[0] = subtensor.indices(dim[0])[1]
            return read_ndarray(f, dim, magic_t)

        raise NotImplementedError('subtensor access not written yet:', subtensor)
    finally:
        # only close what this function opened; caller-owned handles stay open
        if opened_here:
            f.close()
136 | |
def write(f, mat, debug=False):
    """Write ndarray 'mat' to 'f': magic number, rank, dimensions, then data.

    If f is a string, it is treated as a filename: the file is opened in
    binary write mode and closed (hence flushed) before returning.  Otherwise
    f is used as an already-open file object and is left open.

    @param f: filename or open binary file object.
    @param mat: ndarray whose dtype name is a key of _dtype_magic.
    @param debug: if True, print the dimensions written to the header.
    @raise KeyError: if str(mat.dtype) is not in _dtype_magic.
    """
    opened_here = isinstance(f, str)
    if opened_here:
        # the format is binary; text mode could corrupt it on some platforms
        f = open(f, 'wb')

    try:
        _write_int32(f, _dtype_magic[str(mat.dtype)])
        _write_int32(f, len(mat.shape))
        shape = list(mat.shape)
        # The header always carries at least three dimension fields; tensors
        # of lower rank are padded with 1s.
        shape.extend([1] * (3 - len(shape)))
        if debug: print('writing shape = %s' % (shape,))
        for sh in shape:
            _write_int32(f, sh)
        mat.tofile(f)
    finally:
        # only close what this function opened; caller-owned handles stay open
        if opened_here:
            f.close()
150 | |
if __name__ == '__main__':
    #a small test script, starts by reading sys.argv[1]
    rval = read(sys.argv[1], None, debug=True) #load from filename
    print('rval %s %s' % (rval.shape, rval.size))

    if 0:
        # Disabled round-trip check: write the tensor back out, read it again
        # from an open file handle and compare with what was loaded above.
        f = open('/tmp/some_mat', 'wb')
        try:
            write(f, rval)
        finally:
            f.close()
        print('')
        f = open('/tmp/some_mat', 'rb')
        try:
            rval2 = read(f) #load from file handle
        finally:
            f.close()
        print('rval2 %s %s' % (rval2.shape, rval2.size))

        assert rval.dtype == rval2.dtype
        assert rval.shape == rval2.shape
        assert numpy.all(rval == rval2)
        print('ok')
33 | 169 |