annotate filetensor.py @ 469:4335309f4924

Split into preprocess for words and sequences
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 21 Oct 2008 16:32:06 -0400
parents 040cb796f4e0
children
rev   line source
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
1 """
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
2 Read and write the matrix file format described at
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
3 U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html}
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
4
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
5 The format is for dense tensors:
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
6
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
7 - magic number indicating type and endianness - 4bytes
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
8 - rank of tensor - int32
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
9 - dimensions - int32, int32, int32, ...
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
10 - <data>
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
11
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
12 The number of dimensions stored in the header does not always equal the rank:
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
13 - for scalar: rank=0, dimensions = [1, 1, 1]
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
14 - for vector: rank=1, dimensions = [?, 1, 1]
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
15 - for matrix: rank=2, dimensions = [?, ?, 1]
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
16
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
17 For rank >= 3, the number of dimensions matches the rank exactly.
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
18
248
82ba488b2c24 polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 72
diff changeset
19
82ba488b2c24 polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 72
diff changeset
20 @todo: add complex type support
82ba488b2c24 polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 72
diff changeset
21
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
22 """
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
23 import sys
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
24 import numpy
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
25
248
82ba488b2c24 polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 72
diff changeset
26 def _prod(lst):
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
27 p = 1
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
28 for l in lst:
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
29 p *= l
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
30 return p
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
31
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
32 _magic_dtype = {
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
33 0x1E3D4C51 : ('float32', 4),
248
82ba488b2c24 polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 72
diff changeset
34 #0x1E3D4C52 : ('packed matrix', 0), #what is a packed matrix?
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
35 0x1E3D4C53 : ('float64', 8),
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
36 0x1E3D4C54 : ('int32', 4),
35
2508c373cf29 bug fix for reading byte matrices
bergstrj@iro.umontreal.ca
parents: 33
diff changeset
37 0x1E3D4C55 : ('uint8', 1),
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
38 0x1E3D4C56 : ('int16', 2),
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
39 }
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
40 _dtype_magic = {
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
41 'float32': 0x1E3D4C51,
248
82ba488b2c24 polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 72
diff changeset
42 #'packed matrix': 0x1E3D4C52,
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
43 'float64': 0x1E3D4C53,
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
44 'int32': 0x1E3D4C54,
35
2508c373cf29 bug fix for reading byte matrices
bergstrj@iro.umontreal.ca
parents: 33
diff changeset
45 'uint8': 0x1E3D4C55,
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
46 'int16': 0x1E3D4C56
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
47 }
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
48
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
49 #
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
50 # TODO: implement item selection:
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
51 # e.g. load('some mat', subtensor=(:6, 2:5))
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
52 #
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
53 # This function should be memory efficient by:
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
54 # - allocating an output matrix at the beginning
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
55 # - seeking through the file, reading subtensors from multiple places
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
def read(f, subtensor=None, debug=False):
    """Load all or part of file 'f' into a numpy ndarray

    The header (magic number, rank, dimensions) is decoded as int32 in the
    machine's native byte order, then the data is read with numpy.fromfile.

    @param f: file from which to read, positioned at the start of the header
    @type f: file-like object (it is passed to numpy.fromfile, so presumably it
    must be a real file opened in binary mode -- confirm against callers)

    @param subtensor: None to read the whole tensor, or a slice selecting rows
    along the first dimension
    @type subtensor: None or slice

    @param debug: if True, print the header fields as they are parsed
    @type debug: bool

    If subtensor is not None, it should be like the argument to
    numpy.ndarray.__getitem__.  The following two expressions should return
    equivalent ndarray objects, but the one on the left may be faster and more
    memory efficient if the underlying file f is big.

        read(f, subtensor) <===> read(f)[subtensor]

    Only slices with a step of None or 1 are supported.  Start and stop may be
    None or negative; they are interpreted as numpy/Python slicing would.

    @return: the tensor (or the selected rows of it)
    @rtype: numpy.ndarray

    @raise ValueError: if the header is truncated or declares a negative rank
    @raise KeyError: if the magic number is not a key of _magic_dtype
    @raise NotImplementedError: for slices with a step other than 1, and for
    any subtensor that is neither None nor a slice

    """
    def _read_int32(f):
        s = f.read(4)
        if len(s) != 4:
            raise ValueError(
                'truncated filetensor header: expected 4 bytes, got %i'
                % len(s))
        return numpy.frombuffer(s, dtype='int32').item()

    #what is the data type of this matrix?
    magic = _read_int32(f)
    magic_t, elsize = _magic_dtype[magic]
    if debug:
        print('header magic %s %s %s' % (magic, magic_t, elsize))
    if magic_t == 'packed matrix':
        raise NotImplementedError('packed matrix not supported')

    #what is the rank of the tensor?
    ndim = _read_int32(f)
    if debug:
        print('header ndim %s' % ndim)
    if ndim < 0:
        raise ValueError('invalid filetensor header: negative rank %i' % ndim)

    #what are the dimensions of the tensor?
    # The header always stores at least three dimensions, even when the rank
    # is smaller; the extra ones are consumed here and then discarded.
    n_header_dims = max(ndim, 3)
    header_dims = numpy.fromfile(f, dtype='int32', count=n_header_dims)
    if len(header_dims) != n_header_dims:
        raise ValueError(
            'truncated filetensor header: expected %i dimensions, got %i'
            % (n_header_dims, len(header_dims)))
    dim = header_dims[:ndim]
    dim_size = _prod(dim)
    if debug:
        print('header dim %s %s' % (dim, dim_size))

    # Work with Python ints from here on so that element and byte counts
    # cannot overflow int32.
    shape = [int(d) for d in dim]

    rval = None
    if subtensor is None:
        rval = numpy.fromfile(f, dtype=magic_t,
                              count=_prod(shape)).reshape(shape)
    elif isinstance(subtensor, slice):
        if subtensor.step not in (None, 1):
            raise NotImplementedError('slice with step', subtensor.step)
        # Normalise None / negative / out-of-range bounds the way indexing
        # an array with shape[0] rows would.
        start, stop, _step = subtensor.indices(shape[0])
        if start:
            # Data is stored row-major, so skipping leading rows is a plain
            # relative seek over whole rows.
            bytes_per_row = _prod(shape[1:]) * elsize
            f.seek(start * bytes_per_row, 1)
        shape[0] = max(0, stop - start)
        rval = numpy.fromfile(f, dtype=magic_t,
                              count=_prod(shape)).reshape(shape)
    else:
        raise NotImplementedError('subtensor access not written yet:', subtensor)

    return rval
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
112
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
def write(f, mat):
    """Write a numpy.ndarray to file.

    The header is written as int32 values in native byte order: the magic
    number for mat's dtype, the rank, then the dimensions (padded with 1s to
    at least three entries).  The array data follows, written with
    ndarray.tofile.

    @param f: file into which to write
    @type f: file-like object (it is passed to ndarray.tofile, so presumably it
    must be a real file opened in binary mode -- confirm against callers)

    @param mat: array to write to file
    @type mat: numpy ndarray or compatible

    @raise TypeError: if mat's dtype is not a key of _dtype_magic; nothing is
    written to f in that case

    """
    # Accept anything array-like, as the documented type promises.
    mat = numpy.asarray(mat)

    try:
        magic = _dtype_magic[str(mat.dtype)]
    except KeyError:
        raise TypeError('Invalid ndarray dtype for filetensor format', mat.dtype)

    shape = list(mat.shape)
    rank = len(shape)
    # The format stores at least three dimensions; pad short shapes with 1s
    # (the multiplier is <= 0 for rank >= 3, which adds nothing).
    shape += [1] * (3 - rank)

    numpy.asarray([magic, rank] + shape, dtype='int32').tofile(f)
    mat.tofile(f)
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
141