Mercurial > pylearn
annotate embeddings/process.py @ 672:27b1344a57b1
Added preprocessing back in
author | Joseph Turian <turian@gmail.com> |
---|---|
date | Thu, 20 Nov 2008 06:38:06 -0500 |
parents | c6563c629984 |
children |
rev | line source |
---|---|
456
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
1 """ |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
2 Read in the weights file |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
3 """ |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
4 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
5 import string |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
6 import sys |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
7 |
461 | 8 from parameters import * |
9 | |
# Vocabulary words in the order they are listed in VOCABFILE.
# None until read_embeddings() has run.
__words = None
# Maps each vocabulary word to its embedding (a list of floats).
# None until read_embeddings() has run.
__word_to_embedding = None
# Set to True by read_embeddings() after loading, so later calls return at once.
__read = False
458
ed6b0b3be8d2
Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents:
456
diff
changeset
|
13 |
531
90a76a8238e8
Added function length()
Joseph Turian <turian@iro.umontreal.ca>
parents:
469
diff
changeset
|
def length():
    """
    @return: The length of embeddings, i.e. the number of values stored
    for the first vocabulary word.
    @note: Loads the embeddings first if they have not been read yet.
    """
    # The module-level caches are None until read_embeddings() runs; without
    # this call, using length() before any other function in this module
    # fails with a TypeError instead of loading the data on demand.
    read_embeddings()
    return len(__word_to_embedding[__words[0]])
531
90a76a8238e8
Added function length()
Joseph Turian <turian@iro.umontreal.ca>
parents:
469
diff
changeset
|
19 |
def word_to_embedding(w):
    """
    Look up the embedding of a word.
    @param w: The word, spelled exactly as it is keyed in the vocabulary.
    @return: The embedding stored for C{w}.
    @raise KeyError: If C{w} is not in the vocabulary.
    """
    read_embeddings()  # returns immediately once the embeddings are loaded
    embedding = __word_to_embedding[w]
    return embedding
23 | |
458
ed6b0b3be8d2
Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents:
456
diff
changeset
|
def read_embeddings():
    """
    Load the vocabulary and the embedding weights into the module-level cache.

    Reads one word per line from VOCABFILE, and reads the values for all
    words from the second line of WEIGHTSFILE (DIMENSIONS consecutive values
    per word, in vocabulary order). Does nothing if the embeddings have
    already been loaded.

    @raise AssertionError: If VOCABFILE does not hold NUMBER_OF_WORDS words,
    or WEIGHTSFILE does not hold NUMBER_OF_WORDS * DIMENSIONS values.
    """
    global __words
    global __word_to_embedding
    global __read
    if __read:
        return

    # try/finally (rather than "with") closes the files even on error while
    # staying valid on the old Python 2 interpreters this module targets.
    vocabfile = open(VOCABFILE)
    try:
        words = [line.strip() for line in vocabfile]
    finally:
        vocabfile.close()
    assert len(words) == NUMBER_OF_WORDS, \
        "%s has %d words, expected %d" % (VOCABFILE, len(words), NUMBER_OF_WORDS)

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    weightsfile = open(WEIGHTSFILE)
    try:
        # The first line is skipped -- presumably a header; TODO confirm.
        weightsfile.readline()
        vals = [float(v) for v in weightsfile.readline().split()]
    finally:
        weightsfile.close()
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS, \
        "%s has %d values, expected %d" % (
            WEIGHTSFILE, len(vals), NUMBER_OF_WORDS * DIMENSIONS)

    # Word i owns the i-th run of DIMENSIONS values.
    embeddings = {}
    for i, w in enumerate(words):
        embeddings[w] = vals[DIMENSIONS * i:DIMENSIONS * (i + 1)]

    # Sanity check: every embedding has the same length as the first one.
    for w in embeddings:
        assert len(embeddings[words[0]]) == len(embeddings[w])

    # Publish the caches only after everything validated, so a failed load
    # never leaves the module flagged as read with partial data.
    __words = words
    __word_to_embedding = embeddings
    __read = True
    sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)
456
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
50 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
import re

# Matches a single decimal digit; preprocess_word() replaces every match
# with the token "NUMBER".
numberre = re.compile(r"[0-9]")
# Matches an escaped slash: a backslash followed by "/". Written as a raw
# string so the pattern does not depend on the invalid string escape "\/",
# which newer Python versions warn about.
slashre = re.compile(r"\\/")
27b1344a57b1
Added preprocessing back in
Joseph Turian <turian@gmail.com>
parents:
536
diff
changeset
|
54 |
27b1344a57b1
Added preprocessing back in
Joseph Turian <turian@gmail.com>
parents:
536
diff
changeset
|
# Bracket tokens (Penn Treebank style) and the literal bracket each stands for.
_BRACKET_TOKENS = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
}


def preprocess_word(origw):
    """
    Convert a word so that it can be embedded directly.
    @param origw: The original word.
    @return: The preprocessed word, which is always in the vocabulary:
    the word itself (with bracket tokens such as "-LRB-" replaced by the
    bracket) if it is known, else its normalized form if that is known,
    else UNKNOWN.
    @raise AssertionError: If even UNKNOWN is missing from the vocabulary.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    # Idea left by the original author: perhaps run
    # common.penntreebank.preprocess on the word first.
    read_embeddings()
    w = _BRACKET_TOKENS.get(origw, origw)
    if w not in __word_to_embedding:
        # Normalize: lowercase, turn an escaped slash "\/" into "/", and
        # replace every digit with "NUMBER".
        w = w.lower()
        w = slashre.sub("/", w)
        w = numberre.sub("NUMBER", w)
    if w not in __word_to_embedding:
        w = UNKNOWN
    assert w in __word_to_embedding
    return w
4335309f4924
Split into preprocess for words and sequences
Joseph Turian <turian@iro.umontreal.ca>
parents:
468
diff
changeset
|
83 |
4335309f4924
Split into preprocess for words and sequences
Joseph Turian <turian@iro.umontreal.ca>
parents:
468
diff
changeset
|
def preprocess_seq(l):
    """
    Preprocess every word of a sequence so that it can be embedded directly.
    @param l: The sequence of words to convert.
    @return: A new list with L{preprocess_word} applied to each word, in order.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    return [preprocess_word(token) for token in l]
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
96 |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
97 #def convert_string(s, strict=False): |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
98 # """ |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
99 # Convert a string to a sequence of embeddings. |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
100 # @param strict: If strict, then words *must* be in the vocabulary. |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
101 # @todo: DEPRECATED Remove this function. |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
102 # """ |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
103 # read_embeddings() |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
104 # e = [] |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
105 # for origw in string.split(string.lower(s)): |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
106 # w = numberre.sub("NUMBER", origw) |
461 | 107 # if w in __word_to_embedding: |
108 # e.append(__word_to_embedding[w]) | |
459
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
109 # else: |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
110 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
111 # assert not strict |
461 | 112 # e.append(__word_to_embedding[UNKNOWN]) |
459
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
113 # return e |
467
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
114 |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
115 #def test(): |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
116 # """ |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
117 # Debugging code. |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
118 # """ |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
119 # read_embeddings() |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
120 # for w in __word_to_embedding: |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
121 # assert len(__word_to_embedding[w]) == 50 |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
122 # import numpy |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
123 # for w1 in __words: |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
124 # e1 = numpy.asarray(__word_to_embedding[w1]) |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
125 # lst = [] |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
126 # print w1, numpy.dot(e1, e1) |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
127 # for w2 in __word_to_embedding: |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
128 # if w1 >= w2: continue |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
129 # e2 = numpy.asarray(__word_to_embedding[w2]) |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
130 # d = (e1 - e2) |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
131 # l2 = numpy.dot(d, d) |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
132 # lst.append((l2, w1, w2)) |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
133 # lst.sort() |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
134 # print lst[:10] |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
135 # |
f3711bcc467e
Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents:
464
diff
changeset
|
136 #test() |