Mercurial > pylearn
comparison sandbox/embeddings/read.py @ 458:ed6b0b3be8d2
Polished embeddings module
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 07 Oct 2008 19:13:53 -0400 |
parents | sandbox/embeddings/original.py@131e19dfe793 |
children | f400f62e7f9e |
comparison
equal
deleted
inserted
replaced
457:34acf8db186d | 458:ed6b0b3be8d2 |
---|---|
1 """ | |
2 Read in the weights file | |
3 """ | |
4 | |
5 import string | |
6 import sys | |
7 | |
8 from files import * | |
9 | |
# Module-level cache, populated lazily by read_embeddings() on first use.
WORDS = None            # list of vocabulary words, in VOCABFILE order
WORD_TO_VECTOR = None   # dict mapping word -> list of embedding floats
__read = False          # guard flag so the files are parsed only once
def read_embeddings():
    """
    Lazily load the vocabulary and embedding weights into the module-level
    WORDS list and WORD_TO_VECTOR dict.  Subsequent calls are no-ops.

    Reads VOCABFILE (one word per line) and WEIGHTSFILE (one header line,
    then a single line of NUMBER_OF_WORDS * DIMENSIONS whitespace-separated
    floats); all four names come from ``from files import *``.

    Raises:
        AssertionError: if either file's contents disagree with
            NUMBER_OF_WORDS / DIMENSIONS.
    """
    global WORDS
    global WORD_TO_VECTOR
    global __read
    if __read:
        return

    # One vocabulary word per line; strip surrounding whitespace/newlines.
    # 'with' ensures the handle is closed (the original leaked it).
    with open(VOCABFILE) as vocabf:
        WORDS = [w.strip() for w in vocabf]
    assert len(WORDS) == NUMBER_OF_WORDS

    WORD_TO_VECTOR = {}

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    with open(WEIGHTSFILE) as f:
        f.readline()  # skip the header line
        vals = [float(v) for v in f.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
    # NOTE(review): reversing pairs WORDS[0] with the LAST block of weights
    # and flips each vector's dimension order — presumably WEIGHTSFILE is
    # stored back-to-front; confirm against whatever writes that file.
    vals.reverse()
    for i in range(NUMBER_OF_WORDS):
        WORD_TO_VECTOR[WORDS[i]] = vals[DIMENSIONS * i:DIMENSIONS * (i + 1)]
    __read = True
38 | |
import re
# Pattern for rewriting digits before vocabulary lookup.  Note the sub()
# below replaces EACH digit, so e.g. "42" becomes "NUMBERNUMBER".
numberre = re.compile("[0-9]")

def convert_string(s):
    """
    Convert a string to a sequence of embeddings.

    Lowercases and whitespace-splits ``s``, rewrites every digit in each
    token to the literal text "NUMBER" via ``numberre``, and returns the
    list of embedding vectors for tokens present in WORD_TO_VECTOR.
    Out-of-vocabulary tokens are skipped with a warning on stderr, so the
    result may be shorter than the token count.
    """
    read_embeddings()  # ensure WORD_TO_VECTOR is populated
    e = []
    for origw in s.lower().split():
        w = numberre.sub("NUMBER", origw)
        if w in WORD_TO_VECTOR:
            e.append(WORD_TO_VECTOR[w])
        else:
            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
    return e