Mercurial > pylearn
annotate sandbox/embeddings/read.py @ 460:fda72e944104
\/ -> /
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 07 Oct 2008 23:07:23 -0400 |
parents | f400f62e7f9e |
children |
rev | line source |
---|---|
456
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
1 """ |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
2 Read in the weights file |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
3 """ |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
4 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
5 import string |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
6 import sys |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
7 |
458
ed6b0b3be8d2
Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents:
456
diff
changeset
|
8 from files import * |
ed6b0b3be8d2
Polished embeddings module
Joseph Turian <turian@iro.umontreal.ca>
parents:
456
diff
changeset
|
9 |
456
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
# Vocabulary words in file order; filled in by read_embeddings().
WORDS = None
# Maps each vocabulary word to its embedding (a list of floats); filled in
# by read_embeddings().
WORD_TO_VECTOR = None
# True once the embeddings have been loaded, so later calls are no-ops.
__read = False


def read_embeddings():
    """
    Load the vocabulary and its embedding vectors into the module globals.

    Reads one word per line from VOCABFILE and the flat list of weights
    from the second line of WEIGHTSFILE, then fills in WORDS and
    WORD_TO_VECTOR. The files are read only on the first successful call;
    subsequent calls return immediately.

    @raise AssertionError: If VOCABFILE does not hold NUMBER_OF_WORDS
    words, or the weights line does not hold NUMBER_OF_WORDS * DIMENSIONS
    values. The module globals are left untouched in that case.
    """
    global WORDS
    global WORD_TO_VECTOR
    global __read
    if __read:
        return

    # str methods are used instead of string.strip / string.split, which
    # were removed from the string module in Python 3. The "with" blocks
    # make sure both files are closed, even if parsing fails.
    with open(VOCABFILE) as vocabfile:
        words = [line.strip() for line in vocabfile]
    assert len(words) == NUMBER_OF_WORDS, \
        "expected %d words in %s, found %d" % (NUMBER_OF_WORDS, VOCABFILE, len(words))

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    with open(WEIGHTSFILE) as weightsfile:
        # The first line is skipped; the weights are all on the second line.
        weightsfile.readline()
        vals = [float(v) for v in weightsfile.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS, \
        "expected %d weights in %s, found %d" % (NUMBER_OF_WORDS * DIMENSIONS, WEIGHTSFILE, len(vals))

    # The flat list is reversed as a whole before it is cut into chunks, so
    # word i receives the i-th chunk counted from the end of the line, with
    # its components in reverse order.
    # NOTE(review): presumably this mirrors the weights file layout - confirm.
    vals.reverse()
    word_to_vector = {}
    for i, w in enumerate(words):
        word_to_vector[w] = vals[DIMENSIONS * i:DIMENSIONS * (i + 1)]

    # Publish only after everything has been validated, so a failed load
    # never leaves the globals half initialised.
    WORDS = words
    WORD_TO_VECTOR = word_to_vector
    __read = True
456
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
38 |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
39 import re |
131e19dfe793
Added sandbox.embeddings
Joseph Turian <turian@iro.umontreal.ca>
parents:
diff
changeset
|
# Matches a single ASCII digit.
numberre = re.compile(r"[0-9]")
# Matches an escaped slash: a literal backslash followed by "/".
# Written as a raw string because "\/" inside a plain string literal is an
# invalid escape sequence; the compiled pattern is unchanged.
slashre = re.compile(r"\\/")
459
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
42 |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
# Penn Treebank bracket escapes and the bracket characters they stand for.
_PTB_BRACKETS = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
}


def preprocess(l):
    """
    Convert a sequence so that it can be embedded directly.

    A token that is a Penn Treebank bracket escape (e.g. "-LRB-") is
    replaced by the bracket itself. Any other token is lowercased, has each
    escaped slash (a backslash followed by "/") replaced by "/", and has
    every digit replaced by "NUMBER". A token that is still not in the
    vocabulary is replaced by UNKNOWN, and a message is written to stderr.

    @param l: Sequence of token strings.
    @return: The preprocessed sequence, as a new list with one entry per
    input token.
    @raise AssertionError: If UNKNOWN itself is not in the vocabulary.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    lnew = []
    for origw in l:
        w = _PTB_BRACKETS.get(origw)
        if w is None:
            # str.lower replaces string.lower, which was removed from the
            # string module in Python 3.
            w = origw.lower()
            w = slashre.sub("/", w)
            w = numberre.sub("NUMBER", w)
        if w not in WORD_TO_VECTOR:
            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
            w = UNKNOWN
        assert w in WORD_TO_VECTOR, "%s is not in the vocabulary" % (w,)
        lnew.append(w)
    return lnew
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
69 |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
70 #def convert_string(s, strict=False): |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
71 # """ |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
72 # Convert a string to a sequence of embeddings. |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
73 # @param strict: If strict, then words *must* be in the vocabulary. |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
74 # @todo: DEPRECATED Remove this function. |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
75 # """ |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
76 # read_embeddings() |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
77 # e = [] |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
78 # for origw in string.split(string.lower(s)): |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
79 # w = numberre.sub("NUMBER", origw) |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
80 # if w in WORD_TO_VECTOR: |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
81 # e.append(WORD_TO_VECTOR[w]) |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
82 # else: |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
83 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
84 # assert not strict |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
85 # e.append(WORD_TO_VECTOR[UNKNOWN]) |
f400f62e7f9e
Fixed embedding preprocessing
Joseph Turian <turian@iro.umontreal.ca>
parents:
458
diff
changeset
|
86 # return e |