Mercurial > pylearn
annotate sandbox/embeddings/original.py @ 456:131e19dfe793
Added sandbox.embeddings
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 07 Oct 2008 17:56:52 -0400 |
parents | |
children |
rev | line source |
---|---|
456
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
1 """ |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
2 Read in the weights file |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
3 """ |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
4 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
5 import string |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
6 import sys |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
7 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
8 WORDS = None |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
9 WORD_TO_VECTOR = None |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
10 def read(): |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
11 global WORDS |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
12 global WORD_TO_VECTOR |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
13 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
14 weightsfile = "lm-weights.txt" |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
15 vocabfile = "words.asc" |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
16 size = 30000 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
17 dimensions = 50 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
18 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
19 WORDS = [string.strip(w) for w in open(vocabfile).readlines()] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
20 assert len(WORDS) == 30000 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
21 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
22 import numpy, math |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
23 from common.str import percent |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
24 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
25 WORD_TO_VECTOR = {} |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
26 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
27 sys.stderr.write("Reading %s...\n" % weightsfile) |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
28 f = open(weightsfile) |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
29 f.readline() |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
30 vals = [float(v) for v in string.split(f.readline())] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
31 assert len(vals) == size * dimensions |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
32 vals.reverse() |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
33 for i in range(size): |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
34 l = vals[dimensions*i:dimensions*(i+1)] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
35 w = WORDS[i] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
36 WORD_TO_VECTOR[w] = l |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
37 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
38 import re |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
39 numberre = re.compile("[0-9]") |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
40 def convert_string(s): |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
41 """ |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
42 Convert a string to a sequence of embeddings. |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
43 """ |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
44 e = [] |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
45 for origw in string.split(string.lower(s)): |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
46 w = numberre.sub("NUMBER", origw) |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
47 if w in WORD_TO_VECTOR: |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
48 e.append(WORD_TO_VECTOR[w]) |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
49 else: |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
50 sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw)) |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
51 return e |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
52 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
53 read() |