comparison sandbox/embeddings/read.py @ 460:fda72e944104

\/ -> /
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 23:07:23 -0400
parents f400f62e7f9e
children
comparison
equal deleted inserted replaced
459:f400f62e7f9e 460:fda72e944104
36 WORD_TO_VECTOR[w] = l 36 WORD_TO_VECTOR[w] = l
37 __read = True 37 __read = True
38 38
39 import re 39 import re
40 numberre = re.compile("[0-9]") 40 numberre = re.compile("[0-9]")
41 slashre = re.compile("\\\/")
41 42
42 def preprocess(l): 43 def preprocess(l):
43 """ 44 """
44 Convert a sequence so that it can be embedded directly. 45 Convert a sequence so that it can be embedded directly.
45 Return the preprocessed sequence. 46 Return the preprocessed sequence.
47 @note: Preprocessing is appropriate for Penn Treebank style documents.
46 """ 48 """
47 read_embeddings() 49 read_embeddings()
48 lnew = [] 50 lnew = []
49 for origw in l: 51 for origw in l:
50 if origw == "-LRB-": w = "(" 52 if origw == "-LRB-": w = "("
54 elif origw == "-LSB-": w = "[" 56 elif origw == "-LSB-": w = "["
55 elif origw == "-RSB-": w = "]" 57 elif origw == "-RSB-": w = "]"
56 else: 58 else:
57 w = origw 59 w = origw
58 w = string.lower(w) 60 w = string.lower(w)
61 w = slashre.sub("/", w)
59 w = numberre.sub("NUMBER", w) 62 w = numberre.sub("NUMBER", w)
60 if w not in WORD_TO_VECTOR: 63 if w not in WORD_TO_VECTOR:
61 sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) 64 sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
62 w = UNKNOWN 65 w = UNKNOWN
63 assert w in WORD_TO_VECTOR 66 assert w in WORD_TO_VECTOR