annotate pylearn/io/filetensor.py @ 1412:e1b5092b4a53

allow to read gzip filetensor when there is more then 1 tensor in the file.
author Frederic Bastien <nouiz@nouiz.org>
date Wed, 02 Feb 2011 14:33:01 -0500
parents e06c0ff46d2a
children a6e634b83d88
rev   line source
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
1 """
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
2 Read and write the matrix file format described at
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
3 U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html}
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
4
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
5 The format is for dense tensors:
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
6
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
7 - magic number indicating type and endianness - 4bytes
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
8 - rank of tensor - int32
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
9 - dimensions - int32, int32, int32, ...
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
10 - <data>
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
11
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
12 The number of dimensions and rank is slightly tricky:
72
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
13 - for scalar: rank=0, dimensions = [1, 1, 1]
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
14 - for vector: rank=1, dimensions = [?, 1, 1]
2b6656b2ef52 Changed docs slightly
Joseph Turian <turian@iro.umontreal.ca>
parents: 35
diff changeset
15 - for matrix: rank=2, dimensions = [?, ?, 1]
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
16
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
17 For rank >= 3, the number of dimensions matches the rank exactly.
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
18
248
82ba488b2c24 polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 72
diff changeset
19
82ba488b2c24 polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 72
diff changeset
20 @todo: add complex type support
82ba488b2c24 polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 72
diff changeset
21
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
22 """
1401
e06c0ff46d2a allow pylearn.io.filetensor to accept gzip file handle.
Frederic Bastien <nouiz@nouiz.org>
parents: 1394
diff changeset
23 import gzip
e06c0ff46d2a allow pylearn.io.filetensor to accept gzip file handle.
Frederic Bastien <nouiz@nouiz.org>
parents: 1394
diff changeset
24
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
25 import numpy
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
26
248
82ba488b2c24 polished filetensor a little
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 72
diff changeset
def _prod(lst):
    """Return the product of the items of `lst` (1 for an empty sequence).

    NumPy integer scalars are converted to Python ints before being
    multiplied.  Tensor dimensions are read from the file as int32, and
    multiplying fixed-width integers wraps around silently once the element
    count no longer fits; Python ints have arbitrary precision.

    :param lst: iterable of numbers.
    :returns: the product of all the items.
    """
    p = 1
    for factor in lst:
        if isinstance(factor, numpy.integer):
            factor = int(factor)
        p *= factor
    return p
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
32
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
# Magic number found in the first four bytes of a filetensor, mapped to
# (numpy dtype name, size in bytes of one element).
_magic_dtype = {
    0x1E3D4C51: ('float32', 4),
    # 0x1E3D4C52 is the 'packed matrix' type, which is not supported here
    # (what is a packed matrix?).
    0x1E3D4C53: ('float64', 8),
    0x1E3D4C54: ('int32', 4),
    0x1E3D4C55: ('uint8', 1),
    0x1E3D4C56: ('int16', 2),
}

# Reverse lookup (numpy dtype name -> magic number).  It is derived from
# _magic_dtype so the two tables cannot drift apart when a type is added.
_dtype_magic = dict((_name, _magic)
                    for _magic, (_name, _elsize) in _magic_dtype.items())
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
49
693
ee7026de9681 added filetensor.arraylike
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 537
diff changeset
def _read_int32(f):
    """Unpack a 4-byte integer from the current position in file f.

    :param f: file-like object opened in binary mode; only ``f.read`` is
        used, so gzip handles work too.
    :returns: the value as a Python int, decoded as a native-endian int32.
    :raises ValueError: if fewer than 4 bytes could be read.
    """
    s = f.read(4)
    if len(s) != 4:
        raise ValueError('expected 4 bytes for an int32 but got %d' % len(s))
    # frombuffer replaces numpy.fromstring, whose binary mode is deprecated.
    return numpy.frombuffer(s, dtype='int32').item()
ee7026de9681 added filetensor.arraylike
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 537
diff changeset
55
1401
e06c0ff46d2a allow pylearn.io.filetensor to accept gzip file handle.
Frederic Bastien <nouiz@nouiz.org>
parents: 1394
diff changeset
def _read_header(f, debug=False, fromgzip=None):
    """
    Read the header of a filetensor from the current position of `f`.

    The magic number, the rank and the dimension fields are consumed, in
    that order.

    :param f: an open file handle.
    :type f: a file or gzip.GzipFile object

    :param debug: if True, print the header fields as they are decoded.
    :type debug: bool

    :param fromgzip: if True, read the dimensions through ``f.read`` (needed
        for gzip handles); if False, use ``numpy.fromfile``; if None,
        determine it from the type of the file handle.
    :type fromgzip: bool or None

    :returns: data type, element size, rank, shape, size
    :raises NotImplementedError: if the file holds a packed matrix.
    :raises KeyError: if the magic number is not in ``_magic_dtype``.
    :raises ValueError: if the header is truncated or the rank is negative.
    """
    if fromgzip is None:
        fromgzip = isinstance(f, gzip.GzipFile)

    #what is the data type of this matrix?
    magic = _read_int32(f)
    if magic == 0x1E3D4C52:
        # The packed-matrix magic is deliberately absent from _magic_dtype,
        # so it has to be rejected before the table lookup.
        raise NotImplementedError('packed matrix not supported')
    magic_t, elsize = _magic_dtype[magic]
    if debug:
        print('header magic %s %s %s' % (magic, magic_t, elsize))

    #what is the rank of the tensor?
    ndim = _read_int32(f)
    if debug:
        print('header ndim %s' % (ndim,))

    #what are the dimensions of the tensor?
    # At least three dimension fields are always stored, even for rank < 3.
    n_fields = max(ndim, 3)
    if fromgzip:
        raw = f.read(n_fields * 4)
        fields = numpy.frombuffer(raw, dtype='int32')
    else:
        fields = numpy.fromfile(f, dtype='int32', count=n_fields)
    if ndim < 0 or len(fields) != n_fields:
        raise ValueError(
            'truncated or corrupt filetensor header (rank %s)' % (ndim,))
    # copy() gives the caller an array it owns and may modify; frombuffer
    # on its own returns a read-only view of the bytes that were read.
    dim = fields[:ndim].copy()
    # Multiply as Python ints so a large tensor cannot overflow int32.
    dim_size = _prod([int(extent) for extent in dim])
    if debug:
        print('header dim %s %s' % (dim, dim_size))

    return magic_t, elsize, ndim, dim, dim_size
ee7026de9681 added filetensor.arraylike
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 537
diff changeset
93
ee7026de9681 added filetensor.arraylike
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 537
diff changeset
class arraylike(object):
    """Provide an array-like interface to the filetensor in f.

    The rank parameter to __init__ controls how this object interprets the underlying tensor.
    Its behaviour should be clear from the following example.
    Suppose the underlying tensor is MxNxK.

    - If rank is 0, self[i] will be a scalar and len(self) == M*N*K.

    - If rank is 1, self[i] is a vector of length K, and len(self) == M*N.

    - If rank is 2, self[i] is a matrix of size NxK, and len(self) == M.

    - If rank is 3, self[i] is a 3D tensor of size MxNxK, and len(self)==1.

    - If rank is 5, self[i] is a 5D tensor of size 1x1xMxNxK, and len(self) == 1.

    Negative indices count from the end, as for a list.

    :note: Objects of this class generally require exclusive use of the underlying file handle, because
    they call seek() every time you access an element.
    """

    f = None
    """File-like object"""

    magic_t = None
    """numpy data type of array"""

    elsize = None
    """number of bytes per scalar element"""

    ndim = None
    """Rank of underlying tensor"""

    dim = None
    """array dimensions (aka shape), as read from the file header"""

    dim_size = None
    """number of scalars in the tensor (prod of dim)"""

    f_start = None
    """The file position of the first element of the tensor"""

    readshape = None
    """tuple of array dimensions of the block that we read"""

    readsize = None
    """number of elements we must read for each block"""

    returnshape = None
    """shape of the array returned for each block (readshape, left-padded with 1s when rank > ndim)"""

    def __init__(self, f, rank=0, debug=False):
        """Read the filetensor header from `f` and set up block access.

        :param f: open file handle positioned at the start of a filetensor.
        :param rank: rank of the blocks returned by indexing (see the class
            documentation).
        :param debug: if True, print the header and the read parameters.
        """
        self.f = f
        (self.magic_t, self.elsize, self.ndim,
         self.dim, self.dim_size) = _read_header(f, debug)
        self.f_start = f.tell()

        if rank <= self.ndim:
            trailing = self.dim[self.ndim - rank:]
            padding = ()
        else:
            trailing = self.dim
            padding = (1,) * (rank - self.ndim)

        # Python ints keep the size arithmetic below free of int32 overflow.
        self.readshape = tuple([int(extent) for extent in trailing])
        self.returnshape = padding + self.readshape
        self.readsize = _prod(self.readshape)
        if debug:
            print('READ PARAM %s %s %s'
                  % (self.readshape, self.returnshape, self.readsize))

    def __len__(self):
        """Return the number of blocks of shape `readshape` in the tensor."""
        n_blocks = 1
        # The leading dimensions (those not part of a block) are multiplied
        # as Python ints: the header stores them as int32, which would wrap.
        for extent in self.dim[:self.ndim - len(self.readshape)]:
            n_blocks *= int(extent)
        return n_blocks

    def __getitem__(self, idx):
        """Read block number `idx` from the file.

        :param idx: integer block index; negative values count from the end.
        :returns: numpy array of shape `returnshape`.
        :raises IndexError: if `idx` is out of range.
        """
        n_blocks = self.__len__()
        position = idx
        if position < 0:
            # Without this a negative index would seek before the first
            # element and return header bytes interpreted as data.
            position += n_blocks
        if position < 0 or position >= n_blocks:
            raise IndexError(idx)

        n_items = int(self.readsize)
        n_bytes = n_items * int(self.elsize)
        self.f.seek(self.f_start + int(position) * n_bytes)
        if isinstance(self.f, gzip.GzipFile):
            # numpy.fromfile bypasses the decompressing read() of gzip
            # handles, so go through read() for them.
            raw = self.f.read(n_bytes)
            block = numpy.frombuffer(raw, dtype=self.magic_t).copy()
        else:
            block = numpy.fromfile(self.f,
                                   dtype=self.magic_t,
                                   count=n_items)
        return block.reshape(self.returnshape)
ee7026de9681 added filetensor.arraylike
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 537
diff changeset
173
ee7026de9681 added filetensor.arraylike
James Bergstra <bergstrj@iro.umontreal.ca>
parents: 537
diff changeset
174
33
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
175 #
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
176 # TODO: implement item selection:
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
177 # e.g. read(f, subtensor=(slice(None, 6), slice(2, 5)))
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
178 #
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
179 # This function should be memory efficient by:
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
180 # - allocating an output matrix at the beginning
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
181 # - seeking through the file, reading subtensors from multiple places
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
def read(f, subtensor=None, debug=False):
    """Load all or part of file 'f' into a numpy ndarray

    @param f: file from which to read
    @type f: file-like object. Can be a gzip open file.

    @param subtensor: None to read the whole tensor, or a slice (step 1
    only) selecting a range along the first dimension.
    @type subtensor: None or slice

    @param debug: if True, print the header fields while they are decoded.
    @type debug: bool

    If subtensor is not None, it should be like the argument to
    numpy.ndarray.__getitem__.  The following two expressions should return
    equivalent ndarray objects, but the one on the left may be faster and more
    memory efficient if the underlying file f is big.

        read(f, subtensor) <===> read(f)[subtensor]

    Support for subtensors is currently spotty, so check the code to see if your
    particular type of subtensor is supported.

    @raise NotImplementedError: for a slice with a step, or a subtensor that
    is not a slice.
    @raise IndexError: if a slice is requested from a rank-0 tensor.
    """
    magic_t, elsize, ndim, dim, dim_size = _read_header(f, debug)
    f_start = f.tell()

    # Python ints, so the sizes computed below cannot overflow int32.
    shape = [int(extent) for extent in dim]

    if subtensor is None:
        pass
    elif isinstance(subtensor, slice):
        if subtensor.step not in (None, 1):
            raise NotImplementedError('slice with step', subtensor.step)
        if not shape:
            raise IndexError('cannot slice a rank-0 tensor')
        # slice.indices resolves None and negative bounds and clamps them
        # to the first dimension, exactly as ndarray indexing would.
        start, stop, _step = subtensor.indices(shape[0])
        if start:
            bytes_per_row = _prod(shape[1:]) * elsize
            f.seek(f_start + start * bytes_per_row)
        shape[0] = max(stop - start, 0)
    else:
        raise NotImplementedError('subtensor access not written yet:', subtensor)

    count = _prod(shape)
    if isinstance(f, gzip.GzipFile):
        # numpy.fromfile cannot decompress, so read the bytes through the
        # gzip handle.  Exactly the tensor's bytes are consumed, which
        # leaves the handle on the next tensor when the file holds several.
        d = f.read(count * elsize)
        # copy() gives a writable array, as numpy.fromstring used to.
        rval = numpy.frombuffer(d, dtype=magic_t).copy()
        del d
    else:
        rval = numpy.fromfile(f, dtype=magic_t, count=count)

    return rval.reshape(tuple(shape))
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
222
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
def write(f, mat):
    """Write a numpy.ndarray to file.

    The header (magic number, rank, dimensions) is written first, followed
    by the data in C order.  Tensors of rank below 3 get their dimension
    list padded with 1s up to three entries, as the format requires.

    @param f: file into which to write
    @type f: file-like object.  Can be a gzip open file.

    @param mat: array to write to file
    @type mat: numpy ndarray or compatible

    @raise TypeError: if the dtype of mat has no filetensor magic number.
    """
    mat = numpy.asarray(mat)

    def _dump(arr):
        # ndarray.tofile writes straight to the underlying file, bypassing
        # the handle's write() method, so it must not be used on a
        # compressing handle.
        if isinstance(f, gzip.GzipFile):
            f.write(arr.tobytes())
        else:
            arr.tofile(f)

    # Validate the dtype before anything is written to the file.
    try:
        magic = _dtype_magic[str(mat.dtype)]
    except KeyError:
        raise TypeError('Invalid ndarray dtype for filetensor format', mat.dtype)

    shape = list(mat.shape)
    rank = len(shape)
    if rank < 3:
        shape += [1] * (3 - rank)

    _dump(numpy.asarray([magic, rank] + shape, dtype='int32'))
    _dump(mat)
bb92087cb0f6 added filetensor.py
bergstrj@iro.umontreal.ca
parents:
diff changeset
251