embeddings/process.py @ 468:a07948f780b9
Moved embeddings out of sandbox
author   Joseph Turian <turian@iro.umontreal.ca>
date     Tue, 21 Oct 2008 16:24:44 -0400
parents  sandbox/embeddings/process.py@f3711bcc467e
children 4335309f4924
1 """ | |
2 Read in the weights file | |
3 """ | |
4 | |
import re
import sys

from parameters import *

# Module-level cache, filled lazily by read_embeddings().
__words = None
__word_to_embedding = None
__read = False

def word_to_embedding(w):
    """
    Return the embedding (a list of DIMENSIONS floats) for word w.
    """
    read_embeddings()
    return __word_to_embedding[w]

def read_embeddings():
    """
    Read the vocabulary and embedding weights, caching them at module
    level so that subsequent calls are no-ops.
    """
    global __words
    global __word_to_embedding
    global __read
    if __read: return

    __words = [w.strip() for w in open(VOCABFILE).readlines()]
    assert len(__words) == NUMBER_OF_WORDS

    __word_to_embedding = {}

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    f = open(WEIGHTSFILE)
    # Skip the header line. The next line holds all weights as one
    # whitespace-separated sequence of NUMBER_OF_WORDS * DIMENSIONS floats,
    # with the embedding of word i at positions [DIMENSIONS*i, DIMENSIONS*(i+1)).
    f.readline()
    vals = [float(v) for v in f.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
    for i in range(NUMBER_OF_WORDS):
        w = __words[i]
        __word_to_embedding[w] = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
    __read = True
    sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)

# numberre matches a single digit, so substitution replaces each digit
# individually. Penn Treebank escapes "/" as "\/"; slashre undoes that
# escaping.
numberre = re.compile("[0-9]")
slashre = re.compile(r"\\/")

def preprocess(l):
    """
    Convert a sequence of words so that it can be embedded directly.
    Return the preprocessed sequence.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    lnew = []
    for origw in l:
        # Map Penn Treebank bracket tokens back to literal brackets.
        if origw == "-LRB-": w = "("
        elif origw == "-RRB-": w = ")"
        elif origw == "-LCB-": w = "{"
        elif origw == "-RCB-": w = "}"
        elif origw == "-LSB-": w = "["
        elif origw == "-RSB-": w = "]"
        else:
            w = origw
            w = w.lower()
            w = slashre.sub("/", w)
            w = numberre.sub("NUMBER", w)
        if w not in __word_to_embedding:
            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
            w = UNKNOWN
        assert w in __word_to_embedding
        lnew.append(w)
    return lnew

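# A hypothetical example of preprocess() behavior (the actual output depends
# on the contents of VOCABFILE):
#   preprocess("The cost rose 5 % -LRB- see chart -RRB-".split())
# would yield
#   ["the", "cost", "rose", "NUMBER", "%", "(", "see", "chart", ")"]
# with any token missing from the vocabulary replaced by UNKNOWN. Note that
# numberre matches single digits, so a token like "1984" is rewritten to
# "NUMBERNUMBERNUMBERNUMBER".
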
#def convert_string(s, strict=False):
#    """
#    Convert a string to a sequence of embeddings.
#    @param strict: If strict, then words *must* be in the vocabulary.
#    @todo: DEPRECATED Remove this function.
#    """
#    read_embeddings()
#    e = []
#    for origw in s.lower().split():
#        w = numberre.sub("NUMBER", origw)
#        if w in __word_to_embedding:
#            e.append(__word_to_embedding[w])
#        else:
#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
#            assert not strict
#            e.append(__word_to_embedding[UNKNOWN])
#    return e

#def test():
#    """
#    Debugging code: check embedding dimensionality, then print each word's
#    ten nearest neighbors by squared L2 distance.
#    """
#    read_embeddings()
#    for w in __word_to_embedding:
#        assert len(__word_to_embedding[w]) == DIMENSIONS
#    import numpy
#    for w1 in __words:
#        e1 = numpy.asarray(__word_to_embedding[w1])
#        lst = []
#        print(w1, numpy.dot(e1, e1))
#        for w2 in __word_to_embedding:
#            if w1 >= w2: continue
#            e2 = numpy.asarray(__word_to_embedding[w2])
#            d = e1 - e2
#            l2 = numpy.dot(d, d)
#            lst.append((l2, w1, w2))
#        lst.sort()
#        print(lst[:10])
#
#test()
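
A minimal usage sketch (hypothetical driver code, not part of this changeset; it assumes the parameters module that process.py star-imports defines VOCABFILE, WEIGHTSFILE, NUMBER_OF_WORDS, DIMENSIONS, and UNKNOWN, as the code above requires):

    from embeddings import process

    tokens = process.preprocess("Prices fell 3 % -LRB- see chart -RRB-".split())
    for w in tokens:
        e = process.word_to_embedding(w)
        assert len(e) == process.DIMENSIONS  # each embedding is DIMENSIONS floats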