Mercurial > pylearn
comparison sandbox/embeddings/read.py @ 460:fda72e944104
\/ -> /
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Tue, 07 Oct 2008 23:07:23 -0400 |
parents | f400f62e7f9e |
children |
comparison
equal
deleted
inserted
replaced
459:f400f62e7f9e | 460:fda72e944104 |
---|---|
36 WORD_TO_VECTOR[w] = l | 36 WORD_TO_VECTOR[w] = l |
37 __read = True | 37 __read = True |
38 | 38 |
39 import re | 39 import re |
40 numberre = re.compile("[0-9]") | 40 numberre = re.compile("[0-9]") |
 | 41 slashre = re.compile("\\\/") |
41 | 42 |
42 def preprocess(l): | 43 def preprocess(l): |
43 """ | 44 """ |
44 Convert a sequence so that it can be embedded directly. | 45 Convert a sequence so that it can be embedded directly. |
45 Returned the preprocessed sequence. | 46 Returned the preprocessed sequence. |
 | 47 @note: Preprocessing is appropriate for Penn Treebank style documents. |
46 """ | 48 """ |
47 read_embeddings() | 49 read_embeddings() |
48 lnew = [] | 50 lnew = [] |
49 for origw in l: | 51 for origw in l: |
50 if origw == "-LRB-": w = "(" | 52 if origw == "-LRB-": w = "(" |
54 elif origw == "-LSB-": w = "[" | 56 elif origw == "-LSB-": w = "[" |
55 elif origw == "-RSB-": w = "]" | 57 elif origw == "-RSB-": w = "]" |
56 else: | 58 else: |
57 w = origw | 59 w = origw |
58 w = string.lower(w) | 60 w = string.lower(w) |
 | 61 w = slashre.sub("/", w) |
59 w = numberre.sub("NUMBER", w) | 62 w = numberre.sub("NUMBER", w) |
60 if w not in WORD_TO_VECTOR: | 63 if w not in WORD_TO_VECTOR: |
61 sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) | 64 sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) |
62 w = UNKNOWN | 65 w = UNKNOWN |
63 assert w in WORD_TO_VECTOR | 66 assert w in WORD_TO_VECTOR |