sandbox/embeddings/read.py @ 458:ed6b0b3be8d2

Polished embeddings module
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 19:13:53 -0400
parents sandbox/embeddings/original.py@131e19dfe793
children f400f62e7f9e
1 """
2 Read in the weights file
3 """

import sys

from files import *

WORDS = None
WORD_TO_VECTOR = None
__read = False

def read_embeddings():
    """Load the vocabulary and the weight vectors, once, into module globals."""
    global WORDS
    global WORD_TO_VECTOR
    global __read
    if __read: return

    # One vocabulary word per line, in the same order as the weight vectors.
    WORDS = [w.strip() for w in open(VOCABFILE).readlines()]
    assert len(WORDS) == NUMBER_OF_WORDS

    WORD_TO_VECTOR = {}

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    f = open(WEIGHTSFILE)
    # Skip the header line; the second line holds all the weight values.
    f.readline()
    vals = [float(v) for v in f.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
    # The values are stored back-to-front, so reverse the flat list before
    # slicing it into one DIMENSIONS-long vector per word.
    vals.reverse()
    for i in range(NUMBER_OF_WORDS):
        WORD_TO_VECTOR[WORDS[i]] = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
    __read = True
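
# A minimal sketch (not part of the original module) of the weights-file
# layout that read_embeddings() expects: a header line, then a single line
# holding all NUMBER_OF_WORDS * DIMENSIONS floats, stored back-to-front.
# The function below writes such a file for testing against a toy
# vocabulary; its name and the "header" text are illustrative assumptions.
def write_synthetic_weights(path, vectors):
    """vectors: one list of DIMENSIONS floats per vocabulary word, in order."""
    flat = []
    for v in vectors:
        flat.extend(v)
    flat.reverse()  # read_embeddings() reverses the values back
    f = open(path, "w")
    f.write("header\n")
    f.write(" ".join(str(v) for v in flat) + "\n")
    f.close()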
import re
numberre = re.compile("[0-9]")

def convert_string(s):
    """
    Convert a string to a sequence of embeddings, one per in-vocabulary word.
    """
    read_embeddings()
    e = []
    for origw in s.lower().split():
        # Replace each digit with the literal string "NUMBER" before lookup.
        w = numberre.sub("NUMBER", origw)
        if w in WORD_TO_VECTOR:
            e.append(WORD_TO_VECTOR[w])
        else:
            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
    return e
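
# A minimal usage sketch (not in the original file), assuming files.py
# defines VOCABFILE, WEIGHTSFILE, NUMBER_OF_WORDS, and DIMENSIONS, and that
# both files exist. The sentence is an arbitrary example.
if __name__ == "__main__":
    embeddings = convert_string("The quick brown fox jumped over 2 fences")
    if embeddings:
        # One vector (a list of DIMENSIONS floats) per in-vocabulary word.
        print "%d words embedded, %d dimensions each" % (len(embeddings), len(embeddings[0]))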