annotate sandbox/embeddings/read.py @ 460:fda72e944104

\/ -> /
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 23:07:23 -0400
parents f400f62e7f9e
children
rev   line source
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
1 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
2 Read in the weights file
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
3 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
4
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
5 import string
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
6 import sys
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
7
458
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
8 from files import *
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
9
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
# Module-level cache, filled in by read_embeddings() on its first call.
WORDS = None            # list of vocabulary words, in vocabulary-file order
WORD_TO_VECTOR = None   # dict mapping each word to its list of float weights
__read = False          # True once the embeddings were loaded successfully


def read_embeddings():
    """
    Load the vocabulary and the embedding weights into the module globals.

    The first successful call fills WORDS (one stripped entry per line of
    VOCABFILE) and WORD_TO_VECTOR (word -> list of DIMENSIONS floats).
    Later calls return immediately without touching the files again.

    The first line of WEIGHTSFILE is skipped; its second line must hold
    NUMBER_OF_WORDS * DIMENSIONS whitespace-separated floats.

    @note: VOCABFILE, WEIGHTSFILE, NUMBER_OF_WORDS and DIMENSIONS are not
    defined in this block; presumably they are supplied by the
    C{from files import *} at the top of the module.
    @raise AssertionError: If the number of words or the number of weights
    does not match the configured sizes.
    """
    global WORDS
    global WORD_TO_VECTOR
    global __read
    if __read:
        return

    # str methods replace the string-module functions (string.strip,
    # string.split), which no longer exist in Python 3.
    with open(VOCABFILE) as vocab_file:
        WORDS = [line.strip() for line in vocab_file]
    assert len(WORDS) == NUMBER_OF_WORDS

    WORD_TO_VECTOR = {}

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    with open(WEIGHTSFILE) as weights_file:
        weights_file.readline()     # skip the first line of the file
        vals = [float(v) for v in weights_file.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS

    # NOTE(review): the flat weight list is reversed before it is cut into
    # per-word slices, exactly as in the original code. Presumably this
    # matches the layout of the weights file -- confirm before changing.
    vals.reverse()
    for i in range(NUMBER_OF_WORDS):
        WORD_TO_VECTOR[WORDS[i]] = vals[DIMENSIONS * i:DIMENSIONS * (i + 1)]

    # Only mark the cache as valid once everything has loaded.
    __read = True
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
38
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
39 import re
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
# Matches a single ASCII digit.
numberre = re.compile("[0-9]")
# Matches a backslash followed by a slash. Written as a raw string so the
# pattern contains no invalid "\/" string escape.
slashre = re.compile(r"\\/")

# Bracket escape tokens and the literal bracket each one is replaced with.
_BRACKET_TOKENS = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
}


def preprocess(l):
    """
    Convert a sequence so that it can be embedded directly.

    Each token is normalised as follows: bracket escapes such as "-LRB-"
    become the literal bracket, the token is lowercased, every "\\/" becomes
    "/", and every digit is replaced by the string "NUMBER". A token that is
    not in the vocabulary afterwards is replaced by UNKNOWN, and a message
    is written to stderr.

    @param l: Iterable of token strings.
    @return: A new list holding the preprocessed tokens, in input order.
    @raise AssertionError: If UNKNOWN itself is missing from the vocabulary.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    lnew = []
    for origw in l:
        w = _BRACKET_TOKENS.get(origw, origw)
        # str.lower() replaces string.lower(), which no longer exists in
        # Python 3. Lowercasing comes before the digit substitution, so the
        # inserted "NUMBER" keeps its upper case.
        w = w.lower()
        w = slashre.sub("/", w)
        w = numberre.sub("NUMBER", w)
        if w not in WORD_TO_VECTOR:
            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
            w = UNKNOWN
        assert w in WORD_TO_VECTOR
        lnew.append(w)
    return lnew
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
69
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
70 #def convert_string(s, strict=False):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
71 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
72 # Convert a string to a sequence of embeddings.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
73 # @param strict: If strict, then words *must* be in the vocabulary.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
74 # @todo: DEPRECATED Remove this function.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
75 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
76 # read_embeddings()
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
77 # e = []
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
78 # for origw in string.split(string.lower(s)):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
79 # w = numberre.sub("NUMBER", origw)
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
80 # if w in WORD_TO_VECTOR:
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
81 # e.append(WORD_TO_VECTOR[w])
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
82 # else:
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
83 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
84 # assert not strict
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
85 # e.append(WORD_TO_VECTOR[UNKNOWN])
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
86 # return e