comparison embeddings/process.py @ 468:a07948f780b9

Moved embeddings out of sandbox
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 21 Oct 2008 16:24:44 -0400
parents sandbox/embeddings/process.py@f3711bcc467e
children 4335309f4924
1 """
2 Read in the weights file
3 """
4
5 import string
6 import sys
7
8 from parameters import *
9
10 __words = None
11 __word_to_embedding = None
12 __read = False
13
14 def word_to_embedding(w):
15 read_embeddings()
16 return __word_to_embedding[w]
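
# Example usage (hypothetical word; the returned value is that word's
# DIMENSIONS-long list of floats from WEIGHTSFILE):
#   word_to_embedding("apple")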

def read_embeddings():
    """Read the vocabulary and weights files into module state (runs once)."""
    global __words
    global __word_to_embedding
    global __read
    if __read: return

    __words = [string.strip(w) for w in open(VOCABFILE).readlines()]
    assert len(__words) == NUMBER_OF_WORDS

    __word_to_embedding = {}

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    f = open(WEIGHTSFILE)
    f.readline()    # Skip the header line.
    vals = [float(v) for v in string.split(f.readline())]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
    for i in range(NUMBER_OF_WORDS):
        l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
        w = __words[i]
        __word_to_embedding[w] = l
    __read = True
    sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)
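
# For reference, the WEIGHTSFILE layout that read_embeddings() assumes,
# as inferred from the parsing above (numbers made up for illustration):
#
#   <header line, skipped>
#   0.08 -0.41 0.27 ...   one long line of NUMBER_OF_WORDS * DIMENSIONS floats,
#                         where floats [DIMENSIONS*i, DIMENSIONS*(i+1)) are the
#                         embedding of the i-th word in VOCABFILE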

import re
numberre = re.compile("[0-9]")
# Matches the escaped forward slash ("\/") used in Penn Treebank text.
slashre = re.compile(r"\\/")

def preprocess(l):
    """
    Convert a sequence of tokens so that it can be embedded directly.
    Return the preprocessed sequence.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    lnew = []
    for origw in l:
        # Map PTB bracket tokens back to literal brackets.
        if origw == "-LRB-": w = "("
        elif origw == "-RRB-": w = ")"
        elif origw == "-LCB-": w = "{"
        elif origw == "-RCB-": w = "}"
        elif origw == "-LSB-": w = "["
        elif origw == "-RSB-": w = "]"
        else:
            w = origw
            w = string.lower(w)
            w = slashre.sub("/", w)
            w = numberre.sub("NUMBER", w)
        if w not in __word_to_embedding:
            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
            w = UNKNOWN
        assert w in __word_to_embedding
        lnew.append(w)
    return lnew
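
# Example (hypothetical tokens; any word missing from the vocabulary would
# come back as UNKNOWN):
#   preprocess(string.split("-LRB- The price was 3\/4 -RRB-"))
#   =>  ["(", "the", "price", "was", "NUMBER/NUMBER", ")"]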

#def convert_string(s, strict=False):
#    """
#    Convert a string to a sequence of embeddings.
#    @param strict: If strict, then words *must* be in the vocabulary.
#    @todo: DEPRECATED Remove this function.
#    """
#    read_embeddings()
#    e = []
#    for origw in string.split(string.lower(s)):
#        w = numberre.sub("NUMBER", origw)
#        if w in __word_to_embedding:
#            e.append(__word_to_embedding[w])
#        else:
#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
#            assert not strict
#            e.append(__word_to_embedding[UNKNOWN])
#    return e

#def test():
#    """
#    Debugging code.
#    """
#    read_embeddings()
#    for w in __word_to_embedding:
#        assert len(__word_to_embedding[w]) == 50
#    import numpy
#    for w1 in __words:
#        e1 = numpy.asarray(__word_to_embedding[w1])
#        lst = []
#        print w1, numpy.dot(e1, e1)
#        for w2 in __word_to_embedding:
#            if w1 >= w2: continue
#            e2 = numpy.asarray(__word_to_embedding[w2])
#            d = (e1 - e2)
#            l2 = numpy.dot(d, d)
#            lst.append((l2, w1, w2))
#        lst.sort()
#        print lst[:10]
#
#test()
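
# A minimal smoke test, a sketch only: it assumes parameters.py provides
# working VOCABFILE/WEIGHTSFILE paths and that UNKNOWN is in the vocabulary
# (preprocess() already relies on that).
if __name__ == "__main__":
    toks = preprocess(string.split("-LRB- Costs rose 3\/4 % -RRB-"))
    print toks
    for w in toks:
        assert len(word_to_embedding(w)) == DIMENSIONS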