Mercurial > pylearn
annotate filetensor.py @ 99:a8da709eb6a9
In ArrayDataSet.__init__, if a column is given as a single index, we change it to be a list that contains only this index. This way, we remove the special case where the column is an index for all subsequent calls.
This was causing trouble with numpy.vstack() called by MinibatchWrapAroundIterator.next
author | Frederic Bastien <bastienf@iro.umontreal.ca> |
---|---|
date | Tue, 06 May 2008 13:57:36 -0400 |
parents | 2b6656b2ef52 |
children | 82ba488b2c24 |
rev | line source |
---|---|
33 | 1 """ |
2 Read and write the matrix file format described at | |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
3 U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html} |
33 | 4 |
5 The format is for dense tensors: | |
6 | |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
7 - magic number indicating type and endianness - 4bytes |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
8 - rank of tensor - int32 |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
9 - dimensions - int32, int32, int32, ... |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
10 - <data> |
33 | 11 |
12 The number of dimensions and rank is slightly tricky: | |
72
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
13 - for scalar: rank=0, dimensions = [1, 1, 1] |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
14 - for vector: rank=1, dimensions = [?, 1, 1] |
2b6656b2ef52
Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents:
35
diff
changeset
|
15 - for matrix: rank=2, dimensions = [?, ?, 1] |
33 | 16 |
17 For rank >= 3, the number of dimensions matches the rank exactly. | |
18 | |
19 """ | |
20 import sys | |
21 import numpy | |
22 | |
def prod(lst):
    """Return the product of every element of `lst`.

    The running product starts at 1, so an empty sequence yields 1.
    """
    product = 1
    for factor in lst:
        product *= factor
    return product
28 | |
29 _magic_dtype = { | |
30 0x1E3D4C51 : ('float32', 4), | |
31 0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix? | |
32 0x1E3D4C53 : ('float64', 8), | |
33 0x1E3D4C54 : ('int32', 4), | |
35 | 34 0x1E3D4C55 : ('uint8', 1), |
33 | 35 0x1E3D4C56 : ('int16', 2), |
36 } | |
37 _dtype_magic = { | |
38 'float32': 0x1E3D4C51, | |
39 'packed matrix': 0x1E3D4C52, | |
40 'float64': 0x1E3D4C53, | |
41 'int32': 0x1E3D4C54, | |
35 | 42 'uint8': 0x1E3D4C55, |
33 | 43 'int16': 0x1E3D4C56 |
44 } | |
45 | |
def _unused():
    """Leftover debugging code that compares header size with file size.

    Nothing in the visible part of this file calls this function.

    NOTE(review): `f`, `f_data_start`, `debug`, `dim_size` and `elsize` are
    neither parameters nor locals here, so the body only works if names like
    these exist as module globals -- presumably this was cut out of an
    earlier version of read(); confirm before reusing or deleting.
    """
    f.seek(0,2) #seek to end
    f_len = f.tell()
    f.seek(f_data_start,0) #seek back to where we were

    if debug: print 'length:', f_len


    # Number of bytes between the end of the header and the end of the file.
    f_data_bytes = (f_len - f_data_start)

    if debug: print 'data bytes according to header: ', dim_size * elsize
    if debug: print 'data bytes according to file : ', f_data_bytes

    if debug: print 'reading data...'
    sys.stdout.flush()
61 | |
62 def _write_int32(f, i): | |
63 i_array = numpy.asarray(i, dtype='int32') | |
64 if 0: print 'writing int32', i, i_array | |
65 i_array.tofile(f) | |
66 def _read_int32(f): | |
67 s = f.read(4) | |
68 s_array = numpy.fromstring(s, dtype='int32') | |
69 return s_array.item() | |
70 | |
def read_ndarray(f, dim, dtype):
    """Read a dense array of shape `dim` from the open file `f`.

    prod(dim) items of type `dtype` are read from the current file position
    with numpy.fromfile and the flat result is reshaped to `dim`.
    """
    n_items = prod(dim)
    flat = numpy.fromfile(f, dtype=dtype, count=n_items)
    return flat.reshape(dim)
73 | |
#
# TODO: implement general item selection:
#   e.g. read('some mat', subtensor=(slice(None, 6), slice(2, 5)))
#
# This function should be memory efficient by:
#  - allocating an output matrix at the beginning
#  - seeking through the file, reading subtensors from multiple places
def read(f, subtensor=None, debug=False):
    """Load all or part of file 'f' into a numpy ndarray

    If f is a string, it will be treated as a filename, opened in binary read
    mode, and closed again before this function returns.  Otherwise f must be
    an open file object positioned at the start of the header; it is left
    open.

    If subtensor is not None, it should be like the argument to
    numpy.ndarray.__getitem__.  The following two expressions should return
    equivalent ndarray objects, but the one on the left may be faster and more
    memory efficient if the underlying file f is big.

        read(f, subtensor) <===> read(f)[*subtensor]

    Support for subtensors is currently spotty: only a single slice with
    step 1 (or None) over the first axis is implemented.  A non-zero start is
    handled by seeking past the skipped rows, so f must be seekable in that
    case.

    If debug is true, the header fields are printed to stdout.

    Raises NotImplementedError for packed matrices, for slices with a step
    other than 1, and for any subtensor that is not a slice.
    Raises KeyError if the magic number in the header is not recognised.
    """
    opened_here = isinstance(f, str)
    if opened_here:
        if debug:
            print('f %s' % (f,))
        # The format is binary, so the file must not be opened in text mode.
        f = open(f, 'rb')

    try:
        # what is the data type of this matrix?
        magic = _read_int32(f)
        magic_t, elsize = _magic_dtype[magic]
        if debug:
            print('header magic %s %s %s' % (magic, magic_t, elsize))
        if magic_t == 'packed matrix':
            raise NotImplementedError('packed matrix not supported')

        # what is the rank of the tensor?
        ndim = _read_int32(f)
        if debug:
            print('header ndim %s' % (ndim,))

        # what are the dimensions of the tensor?
        # The header always stores at least three dimensions; only the first
        # `ndim` of them are meaningful.
        raw_dim = numpy.fromfile(f, dtype='int32', count=max(ndim, 3))[:ndim]
        # Plain Python ints, so that products of dimensions cannot wrap
        # around in 32-bit arithmetic.
        dim = [int(d) for d in raw_dim]
        if debug:
            print('header dim %s %s' % (raw_dim, prod(dim)))

        if subtensor is None:
            return read_ndarray(f, dim, magic_t)

        if isinstance(subtensor, slice):
            if subtensor.step not in (None, 1):
                raise NotImplementedError('slice with step', subtensor.step)
            # Normalise None, negative and out-of-range bounds exactly the
            # way indexing an array with dim[0] rows would.
            start, stop, _step = subtensor.indices(dim[0])
            if start:
                # Skip the rows that come before the requested range.
                bytes_per_row = prod(dim[1:]) * elsize
                f.seek(start * bytes_per_row, 1)
            dim[0] = max(stop - start, 0)
            return read_ndarray(f, dim, magic_t)

        raise NotImplementedError('subtensor access not written yet:', subtensor)
    finally:
        # Only close what this function opened itself.
        if opened_here:
            f.close()
136 | |
def write(f, mat):
    """Write the ndarray `mat` to `f` in the tensor file format.

    The header consists of the magic number for mat's dtype, the rank, and
    the dimensions (padded with 1s to at least three entries); the raw array
    data follows.

    f: a filename (str), which is opened in binary write mode and closed
       again before returning, or an already open file object, which is
       left open.
    mat: numpy ndarray whose dtype name is a key of _dtype_magic.

    Raises KeyError if mat's dtype is not supported.  The lookup happens
    before a filename is opened, so an unsupported dtype does not truncate
    an existing file.

    NOTE(review): the shape is printed to stdout on every call; this looks
    like leftover debug output but is kept because callers may expect it.
    """
    magic = _dtype_magic[str(mat.dtype)]

    opened_here = isinstance(f, str)
    if opened_here:
        # The format is binary, so the file must not be opened in text mode.
        f = open(f, 'wb')

    try:
        _write_int32(f, magic)
        _write_int32(f, len(mat.shape))
        shape = mat.shape
        if len(shape) < 3:
            # Tensors of rank < 3 still store three dimensions in the header.
            shape = list(shape) + [1] * (3 - len(shape))
        print('writing shape = %s' % (shape,))
        for sh in shape:
            _write_int32(f, sh)
        mat.tofile(f)
    finally:
        # Only close what this function opened itself.
        if opened_here:
            f.close()
150 | |
if __name__ == '__main__':
    # A small test script: reads the tensor file named by sys.argv[1] and
    # prints its shape and number of elements.
    rval = read(sys.argv[1], None, debug=True)  # load from filename
    print('rval %s %s' % (rval.shape, rval.size))

    if 0:
        # Disabled round-trip check: write the tensor to a scratch file, read
        # it back through an open file handle and compare with the original.
        f = open('/tmp/some_mat', 'wb')
        write(f, rval)
        print('')
        f.close()
        f = open('/tmp/some_mat', 'rb')
        rval2 = read(f)  # load from file handle
        f.close()
        print('rval2 %s %s' % (rval2.shape, rval2.size))

        assert rval.dtype == rval2.dtype
        assert rval.shape == rval2.shape
        assert numpy.all(rval == rval2)
        print('ok')