annotate embeddings/process.py @ 536:c6563c629984

Moved word preprocessing out
author Joseph Turian <turian@gmail.com>
date Thu, 20 Nov 2008 06:11:52 -0500
parents eaa5ad4089a1
children 27b1344a57b1
rev   line source
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
1 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
2 Read in the weights file
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
3 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
4
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
5 import string
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
6 import sys
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
7
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
8 from parameters import *
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
9
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
# Module-level cache, filled in by read_embeddings() on first use.
# __words holds the vocabulary words in file order and __word_to_embedding
# maps each word to its embedding; both stay None until the files are read.
__words = __word_to_embedding = None
# Flipped to True once read_embeddings() has loaded everything.
__read = False
458
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
13
531
90a76a8238e8 Added function length()
Joseph Turian <turian@iro.umontreal.ca>
parents: 469
diff changeset
def length():
    """
    @return: The length of embeddings, measured on the embedding of the
    first vocabulary word.
    """
    # Load lazily, exactly like word_to_embedding() does: before the first
    # read the module-level tables are still None and indexing them would
    # raise TypeError.
    read_embeddings()
    return len(__word_to_embedding[__words[0]])
531
90a76a8238e8 Added function length()
Joseph Turian <turian@iro.umontreal.ca>
parents: 469
diff changeset
19
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
def word_to_embedding(w):
    """
    Look up the embedding stored for a word.

    The embeddings are loaded on first use. No preprocessing is applied to
    C{w} here; it must be a vocabulary word as-is.

    @param w: The word to look up.
    @return: The embedding stored for C{w}.
    @raise KeyError: If C{w} is not in the embedding table.
    """
    read_embeddings()
    table = __word_to_embedding
    return table[w]
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
23
458
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
def read_embeddings():
    """
    Load the vocabulary and the embedding weights into the module cache.

    The first call reads VOCABFILE (one word per line, surrounding
    whitespace stripped) and WEIGHTSFILE (first line skipped; the second
    line holds NUMBER_OF_WORDS * DIMENSIONS whitespace-separated floats,
    DIMENSIONS consecutive values per word, in vocabulary order).
    Subsequent calls return immediately.

    Progress messages are written to stderr.

    @raise AssertionError: If the number of words or the number of weights
    does not match NUMBER_OF_WORDS / DIMENSIONS.
    """
    global __words
    global __word_to_embedding
    global __read
    if __read:
        return

    # Context managers close both files even when parsing fails.
    with open(VOCABFILE) as vocab_file:
        words = [line.strip() for line in vocab_file]
    assert len(words) == NUMBER_OF_WORDS

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    with open(WEIGHTSFILE) as weights_file:
        # The first line is not part of the weights; skip it.
        weights_file.readline()
        vals = [float(v) for v in weights_file.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS

    # Word i owns the i-th run of DIMENSIONS values.
    table = {}
    for i, w in enumerate(words):
        table[w] = vals[DIMENSIONS * i:DIMENSIONS * (i + 1)]
    for embedding in table.values():
        assert len(embedding) == DIMENSIONS

    # Publish only after every check has passed, so a failed load never
    # leaves a half-initialised cache that later calls would trust.
    __words = words
    __word_to_embedding = table
    __read = True
    sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
50
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
import re

# Matches a single decimal digit. preprocess_word() substitutes every match,
# so each digit of a multi-digit number is replaced separately.
numberre = re.compile("[0-9]")
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
53
536
c6563c629984 Moved word preprocessing out
Joseph Turian <turian@gmail.com>
parents: 534
diff changeset
def preprocess_word(w):
    """
    Convert a word so that it can be embedded directly.
    Return the preprocessed word.

    A word that is already in the vocabulary is returned unchanged.
    Otherwise it is lowercased and every digit in it is replaced by the
    string "NUMBER"; if the result is still not in the vocabulary, UNKNOWN
    is returned instead.

    @param w: The word to preprocess.
    @return: A word that is a key of the embedding table.
    @raise AssertionError: If even UNKNOWN is missing from the vocabulary.
    @note: Perhaps run L{common.penntreebank.preprocess} on the word first.
    """
    read_embeddings()
    if w not in __word_to_embedding:
        # The str method works on every Python version, unlike the
        # string.lower() module function.
        w = numberre.sub("NUMBER", w.lower())
    if w not in __word_to_embedding:
        w = UNKNOWN
    assert w in __word_to_embedding
    return w
4335309f4924 Split into preprocess for words and sequences
Joseph Turian <turian@iro.umontreal.ca>
parents: 468
diff changeset
69
4335309f4924 Split into preprocess for words and sequences
Joseph Turian <turian@iro.umontreal.ca>
parents: 468
diff changeset
def preprocess_seq(l):
    """
    Convert a sequence of words so that each can be embedded directly.

    @param l: An iterable of words.
    @return: A new list holding L{preprocess_word} of each word, in order.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    return [preprocess_word(word) for word in l]
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
82
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
83 #def convert_string(s, strict=False):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
84 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
85 # Convert a string to a sequence of embeddings.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
86 # @param strict: If strict, then words *must* be in the vocabulary.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
87 # @todo: DEPRECATED Remove this function.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
88 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
89 # read_embeddings()
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
90 # e = []
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
91 # for origw in string.split(string.lower(s)):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
92 # w = numberre.sub("NUMBER", origw)
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
93 # if w in __word_to_embedding:
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
94 # e.append(__word_to_embedding[w])
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
95 # else:
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
96 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
97 # assert not strict
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
98 # e.append(__word_to_embedding[UNKNOWN])
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
99 # return e
467
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
100
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
101 #def test():
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
102 # """
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
103 # Debugging code.
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
104 # """
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
105 # read_embeddings()
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
106 # for w in __word_to_embedding:
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
107 # assert len(__word_to_embedding[w]) == 50
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
108 # import numpy
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
109 # for w1 in __words:
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
110 # e1 = numpy.asarray(__word_to_embedding[w1])
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
111 # lst = []
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
112 # print w1, numpy.dot(e1, e1)
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
113 # for w2 in __word_to_embedding:
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
114 # if w1 >= w2: continue
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
115 # e2 = numpy.asarray(__word_to_embedding[w2])
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
116 # d = (e1 - e2)
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
117 # l2 = numpy.dot(d, d)
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
118 # lst.append((l2, w1, w2))
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
119 # lst.sort()
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
120 # print lst[:10]
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
121 #
f3711bcc467e Fixed a bug in how embeddings are read
Joseph Turian <turian@iro.umontreal.ca>
parents: 464
diff changeset
122 #test()