# Provenance: Mercurial > pylearn — annotate view of filetensor.py @ 451:d99fefbc9324
# changeset message: "Added a KL-divergence."
# author:   Joseph Turian <turian@gmail.com>
# date:     Thu, 04 Sep 2008 14:46:30 -0400
# parents:  040cb796f4e0
# children: (none)
"""
Read and write the matrix file format described at
U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html}

The format is for dense tensors:

    - magic number indicating type and endianness - 4bytes
    - rank of tensor - int32
    - dimensions - int32, int32, int32, ...
    - <data>

The number of dimensions and rank is slightly tricky:

    - for scalar: rank=0, dimensions = [1, 1, 1]
    - for vector: rank=1, dimensions = [?, 1, 1]
    - for matrix: rank=2, dimensions = [?, ?, 1]

For rank >= 3, the number of dimensions matches the rank exactly.

@todo: add complex type support

"""
import sys
import numpy
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
26 def _prod(lst): |
33 | 27 p = 1 |
28 for l in lst: | |
29 p *= l | |
30 return p | |
31 | |
32 _magic_dtype = { | |
33 0x1E3D4C51 : ('float32', 4), | |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
34 #0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix? |
33 | 35 0x1E3D4C53 : ('float64', 8), |
36 0x1E3D4C54 : ('int32', 4), | |
35 | 37 0x1E3D4C55 : ('uint8', 1), |
33 | 38 0x1E3D4C56 : ('int16', 2), |
39 } | |
40 _dtype_magic = { | |
41 'float32': 0x1E3D4C51, | |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
42 #'packed matrix': 0x1E3D4C52, |
33 | 43 'float64': 0x1E3D4C53, |
44 'int32': 0x1E3D4C54, | |
35 | 45 'uint8': 0x1E3D4C55, |
33 | 46 'int16': 0x1E3D4C56 |
47 } | |
48 | |
49 # | |
50 # TODO: implement item selection: | |
51 # e.g. load('some mat', subtensor=(:6, 2:5)) | |
52 # | |
53 # This function should be memory efficient by: | |
54 # - allocating an output matrix at the beginning | |
55 # - seeking through the file, reading subtensors from multiple places | |
56 def read(f, subtensor=None, debug=False): | |
57 """Load all or part of file 'f' into a numpy ndarray | |
58 | |
420
040cb796f4e0
Removed feature of passing the file as a pathname from filetensor.{read, write}
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
248
diff
changeset
|
59 @param f: file from which to read |
040cb796f4e0
Removed feature of passing the file as a pathname from filetensor.{read, write}
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
248
diff
changeset
|
60 @type f: file-like object |
33 | 61 |
62 If subtensor is not None, it should be like the argument to | |
63 numpy.ndarray.__getitem__. The following two expressions should return | |
64 equivalent ndarray objects, but the one on the left may be faster and more | |
65 memory efficient if the underlying file f is big. | |
66 | |
67 read(f, subtensor) <===> read(f)[*subtensor] | |
68 | |
69 Support for subtensors is currently spotty, so check the code to see if your | |
70 particular type of subtensor is supported. | |
71 | |
72 """ | |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
73 def _read_int32(f): |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
74 s = f.read(4) |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
75 s_array = numpy.fromstring(s, dtype='int32') |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
76 return s_array.item() |
33 | 77 |
78 #what is the data type of this matrix? | |
79 #magic_s = f.read(4) | |
80 #magic = numpy.fromstring(magic_s, dtype='int32') | |
81 magic = _read_int32(f) | |
82 magic_t, elsize = _magic_dtype[magic] | |
83 if debug: | |
84 print 'header magic', magic, magic_t, elsize | |
85 if magic_t == 'packed matrix': | |
86 raise NotImplementedError('packed matrix not supported') | |
87 | |
88 #what is the rank of the tensor? | |
89 ndim = _read_int32(f) | |
90 if debug: print 'header ndim', ndim | |
91 | |
92 #what are the dimensions of the tensor? | |
93 dim = numpy.fromfile(f, dtype='int32', count=max(ndim,3))[:ndim] | |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
94 dim_size = _prod(dim) |
33 | 95 if debug: print 'header dim', dim, dim_size |
96 | |
97 rval = None | |
98 if subtensor is None: | |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
99 rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) |
33 | 100 elif isinstance(subtensor, slice): |
101 if subtensor.step not in (None, 1): | |
102 raise NotImplementedError('slice with step', subtensor.step) | |
103 if subtensor.start not in (None, 0): | |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
104 bytes_per_row = _prod(dim[1:]) * elsize |
33 | 105 raise NotImplementedError('slice with start', subtensor.start) |
106 dim[0] = min(dim[0], subtensor.stop) | |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
107 rval = numpy.fromfile(f, dtype=magic_t, count=_prod(dim)).reshape(dim) |
33 | 108 else: |
109 raise NotImplementedError('subtensor access not written yet:', subtensor) | |
110 | |
111 return rval | |
112 | |
113 def write(f, mat): | |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
114 """Write a numpy.ndarray to file. |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
115 |
420
040cb796f4e0
Removed feature of passing the file as a pathname from filetensor.{read, write}
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
248
diff
changeset
|
116 @param f: file into which to write |
040cb796f4e0
Removed feature of passing the file as a pathname from filetensor.{read, write}
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
248
diff
changeset
|
117 @type f: file-like object |
040cb796f4e0
Removed feature of passing the file as a pathname from filetensor.{read, write}
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
248
diff
changeset
|
118 |
040cb796f4e0
Removed feature of passing the file as a pathname from filetensor.{read, write}
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
248
diff
changeset
|
119 @param mat: array to write to file |
040cb796f4e0
Removed feature of passing the file as a pathname from filetensor.{read, write}
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
248
diff
changeset
|
120 @type mat: numpy ndarray or compatible |
040cb796f4e0
Removed feature of passing the file as a pathname from filetensor.{read, write}
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
248
diff
changeset
|
121 |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
122 """ |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
123 def _write_int32(f, i): |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
124 i_array = numpy.asarray(i, dtype='int32') |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
125 if 0: print 'writing int32', i, i_array |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
126 i_array.tofile(f) |
33 | 127 |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
128 try: |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
129 _write_int32(f, _dtype_magic[str(mat.dtype)]) |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
130 except KeyError: |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
131 raise TypeError('Invalid ndarray dtype for filetensor format', mat.dtype) |
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
132 |
33 | 133 _write_int32(f, len(mat.shape)) |
134 shape = mat.shape | |
135 if len(shape) < 3: | |
136 shape = list(shape) + [1] * (3 - len(shape)) | |
248
82ba488b2c24
polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
72
diff
changeset
|
137 if 0: print 'writing shape =', shape |
33 | 138 for sh in shape: |
139 _write_int32(f, sh) | |
140 mat.tofile(f) | |
141 |