annotate embeddings/process.py @ 498:2be795cc5c3a

More documentation + todo
author Joseph Turian <turian@gmail.com>
date Tue, 28 Oct 2008 12:25:15 -0400
parents 4335309f4924
children 90a76a8238e8
rev   line source
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
1 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
2 Read in the weights file
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
3 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
4
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
5 import string
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
6 import sys
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
7
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
8 from parameters import *
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
9
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
# Module-level cache, filled in by read_embeddings() on its first call.
__read = False               # becomes True once the files have been read
__word_to_embedding = None   # becomes a dict: word -> list of float weights
__words = None               # becomes the list of vocabulary words, in file order
458
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
13
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
def word_to_embedding(w):
    """
    Look up the embedding stored for the word w.
    The embeddings are read on first use. The word is looked up verbatim
    (no preprocessing is applied here), so a word that is not in the
    table raises KeyError.
    """
    read_embeddings()
    embedding = __word_to_embedding[w]
    return embedding
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
17
458
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
def read_embeddings():
    """
    Load the vocabulary and the embedding weights into the module cache.

    The first call reads VOCABFILE (one word per line, surrounding
    whitespace stripped) and WEIGHTSFILE, then fills in the module-level
    word list and the word -> embedding table. Later calls return
    immediately.

    WEIGHTSFILE, as consumed here: the first line is skipped and the second
    line holds NUMBER_OF_WORDS * DIMENSIONS whitespace-separated floats;
    each consecutive run of DIMENSIONS values is the embedding of the next
    vocabulary word.

    @raise AssertionError: If the number of words or the number of weights
    does not match NUMBER_OF_WORDS and DIMENSIONS.
    """
    global __words
    global __word_to_embedding
    global __read
    if __read:
        return

    # Context managers close both files even when a check below fails.
    with open(VOCABFILE) as vocabfile:
        words = [line.strip() for line in vocabfile]
    assert len(words) == NUMBER_OF_WORDS

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    with open(WEIGHTSFILE) as weightsfile:
        # The first line is skipped (presumably a header -- TODO confirm).
        weightsfile.readline()
        vals = [float(v) for v in weightsfile.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS

    table = {}
    for i, w in enumerate(words):
        table[w] = vals[DIMENSIONS * i:DIMENSIONS * (i + 1)]

    # Publish only after everything has been parsed, so that a failed read
    # never leaves a half-filled table behind.
    __words = words
    __word_to_embedding = table
    __read = True
    sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
43
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
44 import re
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
# Matches a single digit.
numberre = re.compile(r"[0-9]")
# Matches the two-character sequence backslash + slash (an escaped slash).
# Written as a raw string: the former "\\\/" literal relied on the invalid
# escape "\/", which newer Python versions warn about. The compiled pattern
# is unchanged.
slashre = re.compile(r"\\/")
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
47
469
4335309f4924 Split into preprocess for words and sequences
Joseph Turian <turian@iro.umontreal.ca>
parents: 468
diff changeset
# Penn Treebank bracket tokens and the characters they stand for.
_PTB_BRACKETS = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
}


def preprocess_word(origw):
    """
    Convert a word so that it can be embedded directly.
    Return the preprocessed word, which is always a key of the embedding
    table: a bracket token becomes the bracket character; any other word is
    lowercased, has escaped slashes unescaped and has every digit replaced
    by NUMBER. A result that is not in the vocabulary becomes UNKNOWN.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    @raise AssertionError: If UNKNOWN itself is not in the vocabulary.
    """
    read_embeddings()
    # Lowercasing and the two substitutions leave bracket characters
    # unchanged, so they can be applied to every word uniformly.
    w = _PTB_BRACKETS.get(origw, origw)
    w = w.lower()
    w = slashre.sub("/", w)
    w = numberre.sub("NUMBER", w)
    if w not in __word_to_embedding:
        # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
        w = UNKNOWN
    assert w in __word_to_embedding
    return w
4335309f4924 Split into preprocess for words and sequences
Joseph Turian <turian@iro.umontreal.ca>
parents: 468
diff changeset
71
4335309f4924 Split into preprocess for words and sequences
Joseph Turian <turian@iro.umontreal.ca>
parents: 468
diff changeset
def preprocess_seq(l):
    """
    Convert a sequence of words so that it can be embedded directly.
    Return a new list holding preprocess_word() of every element of l,
    in the same order.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    return [preprocess_word(origw) for origw in l]
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
84
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
85 #def convert_string(s, strict=False):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
86 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
87 # Convert a string to a sequence of embeddings.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
88 # @param strict: If strict, then words *must* be in the vocabulary.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
89 # @todo: DEPRECATED Remove this function.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
90 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
91 # read_embeddings()
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
92 # e = []
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
93 # for origw in string.split(string.lower(s)):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
94 # w = numberre.sub("NUMBER", origw)
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
95 # if w in __word_to_embedding:
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
96 # e.append(__word_to_embedding[w])
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
97 # else:
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
98 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
99 # assert not strict
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
100 # e.append(__word_to_embedding[UNKNOWN])
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
101 # return e
467
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
102
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
103 #def test():
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
104 # """
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
105 # Debugging code.
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
106 # """
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
107 # read_embeddings()
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
108 # for w in __word_to_embedding:
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
109 # assert len(__word_to_embedding[w]) == 50
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
110 # import numpy
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
111 # for w1 in __words:
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
112 # e1 = numpy.asarray(__word_to_embedding[w1])
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
113 # lst = []
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
114 # print w1, numpy.dot(e1, e1)
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
115 # for w2 in __word_to_embedding:
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
116 # if w1 >= w2: continue
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
117 # e2 = numpy.asarray(__word_to_embedding[w2])
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
118 # d = (e1 - e2)
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
119 # l2 = numpy.dot(d, d)
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
120 # lst.append((l2, w1, w2))
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
121 # lst.sort()
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
122 # print lst[:10]
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
123 #
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
124 #test()