annotate sandbox/embeddings/read.py @ 459:f400f62e7f9e

Fixed embedding preprocessing
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 23:00:10 -0400
parents ed6b0b3be8d2
children fda72e944104
rev   line source
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
1 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
2 Read in the weights file
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
3 """
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
4
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
5 import string
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
6 import sys
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
7
458
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
8 from files import *
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
9
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
# Vocabulary words, one per line of VOCABFILE, in file order.
# None until read_embeddings() has run.
WORDS = None
# Maps each vocabulary word to its embedding (a list of floats).
# None until read_embeddings() has run.
WORD_TO_VECTOR = None
# Set to True by read_embeddings() once loading has finished, so that
# repeated calls return immediately.
__read = False
ed6b0b3be8d2 Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents: 456
diff changeset
def read_embeddings():
    """
    Load the vocabulary and the embedding weights into module globals.

    Reads one word per line from VOCABFILE, then reads the second line of
    WEIGHTSFILE (the first line is skipped) as a flat, whitespace-separated
    list of floats and stores one DIMENSIONS-long slice per word.  On
    success WORDS holds the vocabulary list and WORD_TO_VECTOR maps each
    word to its list of floats.  The load happens only once; later calls
    return immediately.

    Raises AssertionError if the vocabulary does not contain exactly
    NUMBER_OF_WORDS entries, or if the weights line does not contain
    exactly NUMBER_OF_WORDS * DIMENSIONS values.
    """
    global WORDS
    global WORD_TO_VECTOR
    global __read
    if __read:
        return

    # str methods are used instead of string.strip/string.split: the
    # string-module functions were removed in Python 3.
    vocabfile = open(VOCABFILE)
    try:
        words = [w.strip() for w in vocabfile.readlines()]
    finally:
        # Close explicitly; try/finally also works on old Python 2.
        vocabfile.close()
    assert len(words) == NUMBER_OF_WORDS

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    f = open(WEIGHTSFILE)
    try:
        # The first line is discarded (presumably a header -- confirm
        # against the weights file format).
        f.readline()
        vals = [float(v) for v in f.readline().split()]
    finally:
        f.close()
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS

    # NOTE(review): reversing the flat list flips both the order of the
    # per-word slices and the order of the components inside each slice.
    # Kept exactly as in the original; confirm it matches the file layout.
    vals.reverse()

    word_to_vector = {}
    for i in range(NUMBER_OF_WORDS):
        word_to_vector[words[i]] = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]

    # Publish only after everything loaded, so a failed load does not
    # leave the globals half-initialised.
    WORDS = words
    WORD_TO_VECTOR = word_to_vector
    __read = True
456
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
38
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
39 import re
131e19dfe793 Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff changeset
# Matches one digit character in the range 0-9.  preprocess() replaces
# every match with the literal token "NUMBER".
numberre = re.compile("[0-9]")
459
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
41
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
# Escaped bracket tokens and the literal bracket each one stands for
# (presumably Penn Treebank-style escapes -- confirm against the corpus).
_BRACKET_TOKENS = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
}

def preprocess(l):
    """
    Convert a sequence of words so that it can be embedded directly.

    Each word is normalised as follows:
      1. escaped bracket tokens (e.g. "-LRB-") become the literal bracket;
      2. the word is lowercased;
      3. every single digit is replaced by the token "NUMBER";
      4. a word that is still not in WORD_TO_VECTOR is reported on stderr
         and replaced by UNKNOWN.

    Loads the embeddings first if they have not been read yet.

    @param l: iterable of word strings.
    @return: a new list with the preprocessed words, same length and
    order as the input.
    Raises AssertionError if UNKNOWN itself is not in the vocabulary.
    """
    read_embeddings()
    lnew = []
    for origw in l:
        w = _BRACKET_TOKENS.get(origw, origw)
        # str.lower replaces string.lower, which was removed in Python 3.
        w = w.lower()
        w = numberre.sub("NUMBER", w)
        if w not in WORD_TO_VECTOR:
            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
            w = UNKNOWN
        assert w in WORD_TO_VECTOR
        lnew.append(w)
    return lnew
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
66
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
67 #def convert_string(s, strict=False):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
68 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
69 # Convert a string to a sequence of embeddings.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
70 # @param strict: If strict, then words *must* be in the vocabulary.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
71 # @todo: DEPRECATED Remove this function.
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
72 # """
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
73 # read_embeddings()
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
74 # e = []
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
75 # for origw in string.split(string.lower(s)):
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
76 # w = numberre.sub("NUMBER", origw)
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
77 # if w in WORD_TO_VECTOR:
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
78 # e.append(WORD_TO_VECTOR[w])
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
79 # else:
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
80 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
81 # assert not strict
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
82 # e.append(WORD_TO_VECTOR[UNKNOWN])
f400f62e7f9e Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents: 458
diff changeset
83 # return e