annotate embeddings/process.py @ 672:27b1344a57b1

Added preprocessing back in
author Joseph Turian <turian@gmail.com>
date Thu, 20 Nov 2008 06:38:06 -0500
parents c6563c629984
children
rev   line source
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
1 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
2 Read in the weights file
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
3 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
4
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
5 import string
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
6 import sys
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
7
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
8 from parameters import *
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
9
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
# Module state, filled in by read_embeddings() on its first call.
# __words holds the vocabulary words in file order, and __word_to_embedding
# maps each of those words to its list of float values.
__words = __word_to_embedding = None
# Set once the tables above have been loaded, so the files are read only once.
__read = False
458
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
13
531
90a76a8238e8 Added function length()
Joseph Turian <turian@iro.umontreal.ca>
parents: 469
diff changeset
def length():
    """
    Return the number of values in one embedding.

    The embeddings are loaded on first use, so this may be called before
    any other function of this module.

    @return: The length of embeddings, taken from the embedding stored for
    the first vocabulary word.
    """
    # Load lazily, as word_to_embedding() and the preprocess_* functions do;
    # without this the module tables are still None on a first call.
    read_embeddings()
    return len(__word_to_embedding[__words[0]])
531
90a76a8238e8 Added function length()
Joseph Turian <turian@iro.umontreal.ca>
parents: 469
diff changeset
19
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
def word_to_embedding(w):
    """
    Look up the embedding stored for a word.

    The embeddings are loaded first if that has not happened yet.

    @param w: The word to look up; it must be a key of the embedding table.
    @return: The list of float values stored for C{w}.
    @raise KeyError: If C{w} has no entry in the embedding table.
    """
    read_embeddings()
    embedding = __word_to_embedding[w]
    return embedding
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
23
458
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
def read_embeddings():
    """
    Load the vocabulary and the embedding weights into the module tables.

    The work is done at most once; later calls return immediately.

    VOCABFILE is read as one word per line (surrounding whitespace is
    stripped). WEIGHTSFILE is read as a first line that is skipped, followed
    by one line of NUMBER_OF_WORDS * DIMENSIONS whitespace-separated floats.
    Word number i receives the values DIMENSIONS*i up to, but excluding,
    DIMENSIONS*(i+1).

    @raise AssertionError: If the number of words or of weight values does
    not match NUMBER_OF_WORDS and DIMENSIONS, or if the stored embeddings
    do not all have the same length.
    """
    global __words
    global __word_to_embedding
    global __read
    if __read:
        return

    # try/finally rather than a bare open() so the handle is always released.
    vocab_file = open(VOCABFILE)
    try:
        __words = [line.strip() for line in vocab_file]
    finally:
        vocab_file.close()
    assert len(__words) == NUMBER_OF_WORDS, \
        "%s holds %d words, expected %d" % (VOCABFILE, len(__words), NUMBER_OF_WORDS)

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    weights_file = open(WEIGHTSFILE)
    try:
        # The first line is skipped; its content is not used here.
        weights_file.readline()
        vals = [float(v) for v in weights_file.readline().split()]
    finally:
        weights_file.close()
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS, \
        "%s holds %d values, expected %d" % (WEIGHTSFILE, len(vals), NUMBER_OF_WORDS * DIMENSIONS)

    __word_to_embedding = {}
    for i in range(NUMBER_OF_WORDS):
        __word_to_embedding[__words[i]] = vals[DIMENSIONS * i:DIMENSIONS * (i + 1)]

    # Every embedding must be as long as the one of the first word.
    for w in __word_to_embedding:
        assert len(__word_to_embedding[__words[0]]) == len(__word_to_embedding[w])

    # Only mark the tables as loaded once they have passed every check.
    __read = True
    sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
50
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
import re

# Matches one ASCII digit; preprocess_word() replaces every match with "NUMBER".
numberre = re.compile(r"[0-9]")
# Matches a backslash immediately followed by a slash, which
# preprocess_word() rewrites to a plain "/".
# Written as a raw string: the earlier non-raw spelling depended on an
# unrecognised escape sequence being passed through unchanged, which newer
# Python versions warn about. The compiled pattern is the same.
slashre = re.compile(r"\\/")
27b1344a57b1 Added preprocessing back in
Joseph Turian <turian@gmail.com>
parents: 536
diff changeset
54
27b1344a57b1 Added preprocessing back in
Joseph Turian <turian@gmail.com>
parents: 536
diff changeset
# Penn Treebank bracket tokens and the bracket character each one stands for.
_PTB_BRACKETS = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
}


def preprocess_word(origw):
    """
    Convert a word so that it can be embedded directly.

    Penn Treebank bracket tokens (-LRB-, -RRB-, -LCB-, -RCB-, -LSB-, -RSB-)
    are first replaced by the bracket character they stand for. If the
    result is not in the vocabulary it is normalized: lowercased, every
    backslash-escaped slash rewritten to a plain slash, and every digit
    replaced by "NUMBER". If the normalized word is still not in the
    vocabulary, UNKNOWN is used instead.

    @param origw: The word to preprocess.
    @return: The preprocessed word, which is a key of the embedding table.
    @raise AssertionError: If even UNKNOWN is missing from the vocabulary.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    #@note: Perhaps run L{common.penntreebank.preprocess} on the word first.
    """
    read_embeddings()
    w = _PTB_BRACKETS.get(origw, origw)
    if w not in __word_to_embedding:
        # Normalize only when the exact form is unknown, so that vocabulary
        # entries containing capitals or digits are still matched as they are.
        w = w.lower()
        w = slashre.sub("/", w)
        w = numberre.sub("NUMBER", w)
    if w not in __word_to_embedding:
        w = UNKNOWN
    assert w in __word_to_embedding
    return w
4335309f4924 Split into preprocess for words and sequences
Joseph Turian <turian@iro.umontreal.ca>
parents: 468
diff changeset
83
4335309f4924 Split into preprocess for words and sequences
Joseph Turian <turian@iro.umontreal.ca>
parents: 468
diff changeset
def preprocess_seq(l):
    """
    Preprocess every word of a sequence so that it can be embedded directly.

    @param l: The words to preprocess, in order.
    @return: A new list holding the result of preprocess_word() for each
    word of C{l}, in the same order.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    return [preprocess_word(word) for word in l]
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
96
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
97 #def convert_string(s, strict=False):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
98 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
99 # Convert a string to a sequence of embeddings.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
100 # @param strict: If strict, then words *must* be in the vocabulary.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
101 # @todo: DEPRECATED Remove this function.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
102 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
103 # read_embeddings()
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
104 # e = []
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
105 # for origw in string.split(string.lower(s)):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
106 # w = numberre.sub("NUMBER", origw)
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
107 # if w in __word_to_embedding:
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
108 # e.append(__word_to_embedding[w])
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
109 # else:
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
110 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
111 # assert not strict
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
112 # e.append(__word_to_embedding[UNKNOWN])
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
113 # return e
467
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
114
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
115 #def test():
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
116 # """
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
117 # Debugging code.
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
118 # """
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
119 # read_embeddings()
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
120 # for w in __word_to_embedding:
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
121 # assert len(__word_to_embedding[w]) == 50
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
122 # import numpy
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
123 # for w1 in __words:
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
124 # e1 = numpy.asarray(__word_to_embedding[w1])
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
125 # lst = []
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
126 # print w1, numpy.dot(e1, e1)
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
127 # for w2 in __word_to_embedding:
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
128 # if w1 >= w2: continue
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
129 # e2 = numpy.asarray(__word_to_embedding[w2])
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
130 # d = (e1 - e2)
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
131 # l2 = numpy.dot(d, d)
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
132 # lst.append((l2, w1, w2))
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
133 # lst.sort()
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
134 # print lst[:10]
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
135 #
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
136 #test()