annotate sandbox/embeddings/process.py @ 461:1243716ade6a

Rearranged
author Joseph Turian <turian@iro.umontreal.ca>
date Wed, 08 Oct 2008 01:09:23 -0400
parents sandbox/embeddings/read.py@fda72e944104
children 121cc6db4481
rev   line source
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
1 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
2 Read in the weights file
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
3 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
4
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
5 import string
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
6 import sys
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
7
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
8 from parameters import *
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
9
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
# Lazily populated module state, filled in by read_embeddings().
# __words holds the vocabulary list and __word_to_embedding maps each word to
# its embedding; both stay None until the embeddings have been read.
__words = __word_to_embedding = None
# Set to True once read_embeddings() has completed, so later calls are no-ops.
__read = False
458
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
13
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
def word_to_embedding(w):
    """
    Look up the embedding stored for a word.

    The embeddings are read on first use (see read_embeddings()).

    @param w: The word to look up. It is used as-is as the lookup key.
    @return: The value stored for w in the word-to-embedding mapping.
    @raise KeyError: If w has no entry in the mapping.
    """
    read_embeddings()
    embedding = __word_to_embedding[w]
    return embedding
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
17
458
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
def read_embeddings():
    """
    Load the vocabulary and the embedding weights into module-level state.

    Reads one word per line from VOCABFILE, then parses the second line of
    WEIGHTSFILE as whitespace-separated floats and maps every vocabulary
    word to a list of DIMENSIONS values. Only the first successful call does
    any work; later calls return immediately.

    The module-level state is updated only after both files have been read
    and validated, so a failed call leaves it untouched.

    @raise AssertionError: If the vocabulary does not contain exactly
    NUMBER_OF_WORDS words, or the weights line does not contain exactly
    NUMBER_OF_WORDS * DIMENSIONS values.
    """
    global __words
    global __word_to_embedding
    global __read
    if __read: return

    # One vocabulary word per line; surrounding whitespace is stripped.
    with open(VOCABFILE) as vocabfile:
        words = [w.strip() for w in vocabfile]
    assert len(words) == NUMBER_OF_WORDS, \
        "Expected %d words in %s, found %d" % (NUMBER_OF_WORDS, VOCABFILE, len(words))

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    with open(WEIGHTSFILE) as f:
        # The first line is skipped; all weights are parsed from the second.
        f.readline()
        vals = [float(v) for v in f.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS, \
        "Expected %d weights in %s, found %d" % (NUMBER_OF_WORDS * DIMENSIONS, WEIGHTSFILE, len(vals))

    # NOTE(review): the flat weight list is reversed before it is sliced into
    # per-word vectors. Presumably this matches the order in which the
    # weights file was written -- confirm against the code that produces it.
    vals.reverse()

    embeddings = {}
    for i, w in enumerate(words):
        embeddings[w] = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]

    # Publish the results only once everything has been read successfully.
    __words = words
    __word_to_embedding = embeddings
    __read = True
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
43
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
44 import re
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
# Matches a single digit; preprocess() replaces every match with "NUMBER".
numberre = re.compile(r"[0-9]")
# Matches a backslash-escaped slash (a backslash followed by "/").
# A raw string keeps the pattern free of invalid string escape sequences.
slashre = re.compile(r"\\/")
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
47
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
# Penn Treebank bracket escapes and the bracket characters they stand for.
_PTB_BRACKETS = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
}

def preprocess(l):
    """
    Convert a sequence of words so that it can be embedded directly.

    Penn Treebank bracket escapes (-LRB-, -RRB-, -LCB-, -RCB-, -LSB-,
    -RSB-) are mapped back to the bracket characters. Every other word is
    lowercased, has each backslash-escaped slash replaced by a plain "/",
    and has each digit replaced by the string "NUMBER". A word that is still
    not in the embedding vocabulary after this is replaced by UNKNOWN, and a
    message is written to stderr.

    @param l: The sequence of words to preprocess. It is not modified.
    @return: A new list with one preprocessed word per input word.
    @raise AssertionError: If UNKNOWN itself has no embedding.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    lnew = []
    for origw in l:
        if origw in _PTB_BRACKETS:
            w = _PTB_BRACKETS[origw]
        else:
            w = origw.lower()
            w = slashre.sub("/", w)
            # Each digit is replaced separately, so "12" becomes "NUMBERNUMBER".
            w = numberre.sub("NUMBER", w)
        if w not in __word_to_embedding:
            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
            w = UNKNOWN
        # Guards against a vocabulary that lacks the UNKNOWN word itself.
        assert w in __word_to_embedding
        lnew.append(w)
    return lnew
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
74
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
75 #def convert_string(s, strict=False):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
76 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
77 # Convert a string to a sequence of embeddings.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
78 # @param strict: If strict, then words *must* be in the vocabulary.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
79 # @todo: DEPRECATED Remove this function.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
80 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
81 # read_embeddings()
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
82 # e = []
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
83 # for origw in string.split(string.lower(s)):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
84 # w = numberre.sub("NUMBER", origw)
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
85 # if w in __word_to_embedding:
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
86 # e.append(__word_to_embedding[w])
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
87 # else:
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
88 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
89 # assert not strict
461
1243716ade6a Rearranged
Joseph Turian <turian@iro.umontreal.ca>
parents: 460
diff changeset
90 # e.append(__word_to_embedding[UNKNOWN])
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
91 # return e