Mercurial > pylearn
annotate amat.py @ 470:bd937e845bbb
new stuff: algorithms/logistic_regression, datasets/MNIST
author | James Bergstra <bergstrj@iro.umontreal.ca> |
---|---|
date | Wed, 22 Oct 2008 15:56:53 -0400 |
parents | 6e69fb91f3c0 |
children |
rev | line source |
---|---|
266
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
1 """load PLearn AMat files""" |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
2 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
3 import sys, numpy, array |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
4 |
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
class AMat:
    """DataSource to access a PLearn amat file as a periodic unrandomized stream.

    AMat stands for Ascii Matri[x,ces].

    The file is read line by line:

    - blank (or whitespace-only) lines are skipped;
    - lines starting with '#' that appear before the first data row are
      header lines; three of them are interpreted:
        '#size: <rows> <cols>'   -> header_size, header_rows, header_cols
        '#sizes: <n> <n> ...'    -> header_sizes (column count of each group)
        '#: <name> <name> ...'   -> header_col_names
    - lines starting with '#' that appear after the first data row are ignored;
    - every other line is a row of whitespace-separated floats, and all rows
      must have the same number of values.

    Attributes:

    input -- all columns of input
    target -- all columns of target
    weight -- all columns of weight
    extra -- all columns of extra

    all -- the entire data contents of the amat file
    n_examples -- the number of training examples in the file

    input, target, weight and extra are views into `all`, and stay None unless
    the file has a '#sizes:' header line.

    """

    marker_size = '#size:'
    marker_sizes = '#sizes:'
    marker_col_names = '#:'

    def __init__(self, path, head=None, update_interval=0, ofile=sys.stdout):
        """Load the amat at <path> into memory.

        path - str: location of amat file
        head - int: stop reading after this many data rows (None reads all)
        update_interval - int: print '.' to ofile every <this many> data rows
            (0 disables progress output)
        ofile - file: print status, msgs, etc. to this file (None disables
            progress output)

        Raises IOError if a data row does not have the same number of values
        as the first data row, and ValueError if a value cannot be parsed.

        """
        self.all = None
        self.input = None
        self.target = None
        self.weight = None
        self.extra = None

        self.header = False
        self.header_size = None
        self.header_rows = None
        self.header_cols = None
        self.header_sizes = None
        self.header_col_names = []

        # Progress output needs both a positive interval and somewhere to write.
        show_progress = update_interval > 0 and ofile is not None

        data_started = False
        data = array.array('d')
        n_data_lines = 0
        n_cols = None  # number of values per row, fixed by the first data row

        f = open(path)
        try:
            for i, line in enumerate(f):
                if n_data_lines == head:
                    # we've read enough data,
                    # break even if there's more in the file
                    break

                text = line.strip()
                if not text:
                    continue

                if text[0] == '#':
                    # '#' lines before the first data row form the header;
                    # later ones are comments and are ignored.
                    if not data_started:
                        self.header = True
                        self._parse_header_line(text)
                    continue

                # the first non-commented line tells us that the header is done
                data_started = True
                float_line = [float(s) for s in text.split()]
                if n_cols is None:
                    n_cols = len(float_line)
                    if (self.header_cols is not None
                            and self.header_cols != n_cols):
                        sys.stderr.write(
                            'WARNING: header declared %i cols but first line'
                            ' has %i, using %i\n'
                            % (self.header_cols, n_cols, n_cols))
                elif n_cols != len(float_line):
                    raise IOError('wrong line length', i, line)
                data.extend(float_line)
                n_data_lines += 1

                if show_progress and n_data_lines % update_interval == 0:
                    ofile.write('.')
                    ofile.flush()
        finally:
            # close the file even when a row fails to parse
            f.close()

        if show_progress:
            ofile.write('\n')

        # convert from array.array to numpy.ndarray
        if n_cols is None:
            # No data row was read (empty file, header-only file or head=0):
            # build an empty matrix, as wide as the header declared if it did.
            width = self.header_cols if self.header_cols is not None else 0
            self.all = numpy.zeros((0, width))
        else:
            # frombuffer does not copy: the result shares memory with `data`.
            self.all = numpy.frombuffer(data, dtype=numpy.float64).reshape(
                (n_data_lines, n_cols))
        self.n_examples = self.all.shape[0]

        # assign
        if self.header_sizes is not None:
            if len(self.header_sizes) > 4:
                sys.stderr.write(
                    'WARNING: ignoring sizes after 4th in %s\n' % path)
            leftmost = 0
            # here we make use of the fact that zip stops at the shorter
            # sequence: fewer than 4 sizes leaves the remaining attributes
            # None, more than 4 sizes are ignored
            attrlist = ['input', 'target', 'weight', 'extra']
            for attr, ncols in zip(attrlist, self.header_sizes):
                setattr(self, attr, self.all[:, leftmost:leftmost + ncols])
                leftmost += ncols

    def _parse_header_line(self, text):
        """Record the information carried by one stripped '#' header line.

        Lines that start with none of the three known markers are ignored.
        """
        if text.startswith(AMat.marker_size):
            info = text[len(AMat.marker_size):]
            self.header_size = [int(s) for s in info.split()]
            # exactly two values are expected: rows, then columns
            self.header_rows, self.header_cols = self.header_size
        elif text.startswith(AMat.marker_col_names):
            info = text[len(AMat.marker_col_names):]
            self.header_col_names = info.split()
        elif text.startswith(AMat.marker_sizes):
            info = text[len(AMat.marker_sizes):]
            self.header_sizes = [int(s) for s in info.split()]
6e69fb91f3c0
initial commit of amat
James Bergstra <bergstrj@iro.umontreal.ca>
parents:
diff
changeset
|
120 |