comparison embeddings/process.py @ 672:27b1344a57b1

Added preprocessing back in
author Joseph Turian <turian@gmail.com>
date Thu, 20 Nov 2008 06:38:06 -0500
parents c6563c629984
children
comparison
equal deleted inserted replaced
536:c6563c629984 672:27b1344a57b1
48 for w in __word_to_embedding: assert len(__word_to_embedding[__words[0]]) == len(__word_to_embedding[w]) 48 for w in __word_to_embedding: assert len(__word_to_embedding[__words[0]]) == len(__word_to_embedding[w])
49 sys.stderr.write("...done reading %s\n" % WEIGHTSFILE) 49 sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)
50 50
51 import re 51 import re
52 numberre = re.compile("[0-9]") 52 numberre = re.compile("[0-9]")
53 53 slashre = re.compile("\\\/")
54 def preprocess_word(w): 54
55 def preprocess_word(origw):
55 """ 56 """
56 Convert a word so that it can be embedded directly. 57 Convert a word so that it can be embedded directly.
57 Returned the preprocessed sequence. 58 Returned the preprocessed sequence.
58 @note: Perhaps run L{common.penntreebank.preprocess} on the word first. 59 @note: Preprocessing is appropriate for Penn Treebank style documents.
60 #@note: Perhaps run L{common.penntreebank.preprocess} on the word first.
59 """ 61 """
60 read_embeddings() 62 read_embeddings()
61 if w not in __word_to_embedding: 63 if origw == "-LRB-": w = "("
62 w = string.lower(w) 64 elif origw == "-RRB-": w = ")"
63 w = numberre.sub("NUMBER", w) 65 elif origw == "-LCB-": w = "{"
66 elif origw == "-RCB-": w = "}"
67 elif origw == "-LSB-": w = "["
68 elif origw == "-RSB-": w = "]"
69 else:
70 w = origw
71 if w not in __word_to_embedding:
72 w = string.lower(w)
73 w = slashre.sub("/", w)
74 w = numberre.sub("NUMBER", w)
75 # if w not in __word_to_embedding:
76 # w = string.lower(w)
77 # w = numberre.sub("NUMBER", w)
64 if w not in __word_to_embedding: 78 if w not in __word_to_embedding:
65 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) 79 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
66 w = UNKNOWN 80 w = UNKNOWN
67 assert w in __word_to_embedding 81 assert w in __word_to_embedding
68 return w 82 return w