Mercurial > pylearn
annotate amat.py @ 311:0690de82c338
a lot of tests are broken because of the new behaviour of __getitem__ that always returns a LookupList, working on that...
author | Thierry Bertin-Mahieux <bertinmt@iro.umontreal.ca> |
---|---|
date | Wed, 11 Jun 2008 12:45:24 -0400 |
parents | 6e69fb91f3c0 |
children | bd937e845bbb |
rev | line source |
---|---|
266
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
1 """load PLearn AMat files""" |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
2 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
3 import sys, numpy, array |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
4 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
# Hard-coded absolute path to an '.amat' file; presumably the MNIST dataset
# in PLearn AMat format (inferred from the name -- TODO confirm).  The path is
# site-specific and is not referenced by the AMat class in this file.
path_MNIST = '/u/bergstrj/pub/data/mnist.amat'
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
6 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
7 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
8 class AMat: |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
9 """DataSource to access a plearn amat file as a periodic unrandomized stream. |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
10 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
11 Attributes: |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
12 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
13 input -- minibatch of input |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
14 target -- minibatch of target |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
15 weight -- minibatch of weight |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
16 extra -- minitbatch of extra |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
17 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
18 all -- the entire data contents of the amat file |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
19 n_examples -- the number of training examples in the file |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
20 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
21 AMat stands for Ascii Matri[x,ces] |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
22 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
23 """ |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
24 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
25 marker_size = '#size:' |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
26 marker_sizes = '#sizes:' |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
27 marker_col_names = '#:' |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
28 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
29 def __init__(self, path, head=None, update_interval=0, ofile=sys.stdout): |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
30 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
31 """Load the amat at <path> into memory. |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
32 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
33 path - str: location of amat file |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
34 head - int: stop reading after this many data rows |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
35 update_interval - int: print '.' to ofile every <this many> lines |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
36 ofile - file: print status, msgs, etc. to this file |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
37 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
38 """ |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
39 self.all = None |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
40 self.input = None |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
41 self.target = None |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
42 self.weight = None |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
43 self.extra = None |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
44 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
45 self.header = False |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
46 self.header_size = None |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
47 self.header_rows = None |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
48 self.header_cols = None |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
49 self.header_sizes = None |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
50 self.header_col_names = [] |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
51 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
52 data_started = False |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
53 data = array.array('d') |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
54 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
55 f = open(path) |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
56 n_data_lines = 0 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
57 len_float_line = None |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
58 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
59 for i,line in enumerate(f): |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
60 if n_data_lines == head: |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
61 #we've read enough data, |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
62 # break even if there's more in the file |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
63 break |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
64 if len(line) == 0 or line == '\n': |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
65 continue |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
66 if line[0] == '#': |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
67 if not data_started: |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
68 #the condition means that the file has a header, and we're on |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
69 # some header line |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
70 self.header = True |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
71 if line.startswith(AMat.marker_size): |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
72 info = line[len(AMat.marker_size):] |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
73 self.header_size = [int(s) for s in info.split()] |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
74 self.header_rows, self.header_cols = self.header_size |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
75 if line.startswith(AMat.marker_col_names): |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
76 info = line[len(AMat.marker_col_names):] |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
77 self.header_col_names = info.split() |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
78 elif line.startswith(AMat.marker_sizes): |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
79 info = line[len(AMat.marker_sizes):] |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
80 self.header_sizes = [int(s) for s in info.split()] |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
81 else: |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
82 #the first non-commented line tells us that the header is done |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
83 data_started = True |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
84 float_line = [float(s) for s in line.split()] |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
85 if len_float_line is None: |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
86 len_float_line = len(float_line) |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
87 if (self.header_cols is not None) \ |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
88 and self.header_cols != len_float_line: |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
89 print >> sys.stderr, \ |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
90 'WARNING: header declared %i cols but first line has %i, using %i',\ |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
91 self.header_cols, len_float_line, len_float_line |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
92 else: |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
93 if len_float_line != len(float_line): |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
94 raise IOError('wrong line length', i, line) |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
95 data.extend(float_line) |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
96 n_data_lines += 1 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
97 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
98 if update_interval > 0 and (ofile is not None) \ |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
99 and n_data_lines % update_interval == 0: |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
100 ofile.write('.') |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
101 ofile.flush() |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
102 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
103 if update_interval > 0: |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
104 ofile.write('\n') |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
105 f.close() |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
106 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
107 # convert from array.array to numpy.ndarray |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
108 nshape = (len(data) / len_float_line, len_float_line) |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
109 self.all = numpy.frombuffer(data).reshape(nshape) |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
110 self.n_examples = self.all.shape[0] |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
111 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
112 # assign |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
113 if self.header_sizes is not None: |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
114 if len(self.header_sizes) > 4: |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
115 print >> sys.stderr, 'WARNING: ignoring sizes after 4th in %s' % path |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
116 leftmost = 0 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
117 #here we make use of the fact that if header_sizes has len < 4 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
118 # the loop will exit before 4 iterations |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
119 attrlist = ['input', 'target', 'weight', 'extra'] |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
120 for attr, ncols in zip(attrlist, self.header_sizes): |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
121 setattr(self, attr, self.all[:, leftmost:leftmost+ncols]) |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
122 leftmost += ncols |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
123 |