annotate embeddings/process.py @ 532:34ee3aff3e8f

Improved embedding word preprocessing.
author Joseph Turian <turian@gmail.com>
date Tue, 18 Nov 2008 02:57:50 -0500
parents 90a76a8238e8
children de974b4fc4ea
rev   line source
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
1 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
2 Read in the weights file
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
3 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
4
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
5 import string
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
6 import sys
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
7
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
8 from parameters import *
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
9
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
10 __words = None
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
11 __word_to_embedding = None
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
12 __read = False
458
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
13
531
90a76a8238e8 Added function length()
Joseph Turian <turian@iro.umontreal.ca>
parents: 469
diff changeset
def length():
    """
    Return the dimensionality of the embeddings.

    The embeddings are loaded first if they have not been read yet.

    @return: The number of values in the embedding of the first vocabulary word.
    @raise IndexError: If the vocabulary is empty.
    """
    read_embeddings()
    # The table is keyed by word, not by position, so go through the first
    # vocabulary word to pick a representative embedding.
    return len(__word_to_embedding[__words[0]])
90a76a8238e8 Added function length()
Joseph Turian <turian@iro.umontreal.ca>
parents: 469
diff changeset
19
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
def word_to_embedding(w):
    """
    Look up the embedding stored for a word.

    @param w: The word to look up. It is used as the key exactly as given;
    no preprocessing is applied here.
    @return: The entry stored for w in the embedding table.
    @raise KeyError: If w has no entry in the table.
    """
    read_embeddings()  # returns immediately once the tables are loaded
    embedding = __word_to_embedding[w]
    return embedding
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
23
458
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
def read_embeddings():
    """
    Load the vocabulary and the embedding weights into the module tables.

    One word is read per line of VOCABFILE. The first line of WEIGHTSFILE
    is skipped; its second line must hold NUMBER_OF_WORDS * DIMENSIONS
    whitespace-separated floats. Each consecutive run of DIMENSIONS values
    becomes the embedding of the next vocabulary word, in file order.

    Once a load has succeeded, later calls return immediately. Progress
    messages are written to stderr.

    @raise AssertionError: If the number of words or the number of weights
    does not match NUMBER_OF_WORDS and DIMENSIONS.
    """
    global __words
    global __word_to_embedding
    global __read
    if __read: return

    vocabfile = open(VOCABFILE)
    try:
        words = [w.strip() for w in vocabfile.readlines()]
    finally:
        # Close the handle even if reading fails.
        vocabfile.close()
    assert len(words) == NUMBER_OF_WORDS

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    weightsfile = open(WEIGHTSFILE)
    try:
        weightsfile.readline()  # the first line is skipped, not parsed
        vals = [float(v) for v in weightsfile.readline().split()]
    finally:
        weightsfile.close()
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS

    table = {}
    for i in range(NUMBER_OF_WORDS):
        table[words[i]] = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
    # Sanity check: every embedding has the same length as the first word's.
    for w in table:
        assert len(table[words[0]]) == len(table[w])

    # Publish the tables only after everything loaded and validated, so a
    # failed load never leaves half-filled module state behind.
    __words = words
    __word_to_embedding = table
    __read = True
    sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
50
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
import re

# Matches one decimal digit; preprocess_word() replaces every match with
# the token "NUMBER".
numberre = re.compile(r"[0-9]")
# Matches an escaped slash (a backslash followed by "/");
# preprocess_word() replaces it with a plain "/". Written as a raw string
# so the pattern does not depend on "\/" being passed through as an
# unrecognised string escape.
slashre = re.compile(r"\\/")
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
54
469
4335309f4924 Split into preprocess for words and sequences
Joseph Turian <turian@iro.umontreal.ca>
parents: 468
diff changeset
# Penn Treebank bracket tokens and the bracket characters they stand for.
_PTB_BRACKETS = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
}

def preprocess_word(origw):
    """
    Convert a word so that it can be embedded directly.

    Penn Treebank bracket tokens (-LRB-, -RRB-, -LCB-, -RCB-, -LSB-, -RSB-)
    are mapped to the bracket characters they stand for. If the result is
    not in the vocabulary as it stands, it is lowercased, every escaped
    slash (a backslash followed by "/") becomes a plain "/", and every
    digit is replaced by "NUMBER". If the normalised form is still not in
    the vocabulary, UNKNOWN is used instead.

    @param origw: The word to preprocess.
    @return: The preprocessed word, which is a key of the embedding table.
    @raise AssertionError: If even UNKNOWN is missing from the vocabulary.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    w = _PTB_BRACKETS.get(origw, origw)
    if w not in __word_to_embedding:
        # Normalise only when the exact form is unknown, so vocabulary
        # entries that match exactly are preferred.
        w = w.lower()
        w = slashre.sub("/", w)
        # Runs after lowercasing, so the inserted token keeps its capitals.
        w = numberre.sub("NUMBER", w)
    if w not in __word_to_embedding:
        w = UNKNOWN
    assert w in __word_to_embedding
    return w
4335309f4924 Split into preprocess for words and sequences
Joseph Turian <turian@iro.umontreal.ca>
parents: 468
diff changeset
79
4335309f4924 Split into preprocess for words and sequences
Joseph Turian <turian@iro.umontreal.ca>
parents: 468
diff changeset
def preprocess_seq(l):
    """
    Preprocess every word of a sequence so the sequence can be embedded directly.

    @param l: An iterable of words.
    @return: A new list holding preprocess_word() of each word of l, in order.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()  # load up front, even when l is empty
    return [preprocess_word(origw) for origw in l]
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
92
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
93 #def convert_string(s, strict=False):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
94 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
95 # Convert a string to a sequence of embeddings.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
96 # @param strict: If strict, then words *must* be in the vocabulary.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
97 # @todo: DEPRECATED Remove this function.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
98 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
99 # read_embeddings()
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
100 # e = []
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
101 # for origw in string.split(string.lower(s)):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
102 # w = numberre.sub("NUMBER", origw)
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
103 # if w in __word_to_embedding:
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
104 # e.append(__word_to_embedding[w])
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
105 # else:
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
106 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
107 # assert not strict
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
108 # e.append(__word_to_embedding[UNKNOWN])
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
109 # return e
467
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
110
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
111 #def test():
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
112 # """
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
113 # Debugging code.
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
114 # """
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
115 # read_embeddings()
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
116 # for w in __word_to_embedding:
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
117 # assert len(__word_to_embedding[w]) == 50
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
118 # import numpy
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
119 # for w1 in __words:
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
120 # e1 = numpy.asarray(__word_to_embedding[w1])
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
121 # lst = []
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
122 # print w1, numpy.dot(e1, e1)
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
123 # for w2 in __word_to_embedding:
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
124 # if w1 >= w2: continue
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
125 # e2 = numpy.asarray(__word_to_embedding[w2])
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
126 # d = (e1 - e2)
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
127 # l2 = numpy.dot(d, d)
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
128 # lst.append((l2, w1, w2))
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
129 # lst.sort()
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
130 # print lst[:10]
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
131 #
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
132 #test()