Mercurial > pylearn
comparison embeddings/process.py @ 672:27b1344a57b1
Added preprocessing back in
author | Joseph Turian <turian@gmail.com> |
---|---|
date | Thu, 20 Nov 2008 06:38:06 -0500 |
parents | c6563c629984 |
children |
comparison
equal
deleted
inserted
replaced
536:c6563c629984 | 672:27b1344a57b1 |
---|---|
48 for w in __word_to_embedding: assert len(__word_to_embedding[__words[0]]) == len(__word_to_embedding[w]) | 48 for w in __word_to_embedding: assert len(__word_to_embedding[__words[0]]) == len(__word_to_embedding[w]) |
49 sys.stderr.write("...done reading %s\n" % WEIGHTSFILE) | 49 sys.stderr.write("...done reading %s\n" % WEIGHTSFILE) |
50 | 50 |
51 import re | 51 import re |
52 numberre = re.compile("[0-9]") | 52 numberre = re.compile("[0-9]") |
53 | 53 slashre = re.compile("\\\/") |
54 def preprocess_word(w): | 54 |
| | 55 def preprocess_word(origw): |
55 """ | 56 """ |
56 Convert a word so that it can be embedded directly. | 57 Convert a word so that it can be embedded directly. |
57 Returned the preprocessed sequence. | 58 Returned the preprocessed sequence. |
58 @note: Perhaps run L{common.penntreebank.preprocess} on the word first. | 59 @note: Preprocessing is appropriate for Penn Treebank style documents. |
| | 60 #@note: Perhaps run L{common.penntreebank.preprocess} on the word first. |
59 """ | 61 """ |
60 read_embeddings() | 62 read_embeddings() |
61 if w not in __word_to_embedding: | 63 if origw == "-LRB-": w = "(" |
62 w = string.lower(w) | 64 elif origw == "-RRB-": w = ")" |
63 w = numberre.sub("NUMBER", w) | 65 elif origw == "-LCB-": w = "{" |
| | 66 elif origw == "-RCB-": w = "}" |
| | 67 elif origw == "-LSB-": w = "[" |
| | 68 elif origw == "-RSB-": w = "]" |
| | 69 else: |
| | 70 w = origw |
| | 71 if w not in __word_to_embedding: |
| | 72 w = string.lower(w) |
| | 73 w = slashre.sub("/", w) |
| | 74 w = numberre.sub("NUMBER", w) |
| | 75 # if w not in __word_to_embedding: |
| | 76 # w = string.lower(w) |
| | 77 # w = numberre.sub("NUMBER", w) |
64 if w not in __word_to_embedding: | 78 if w not in __word_to_embedding: |
65 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) | 79 # sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw)) |
66 w = UNKNOWN | 80 w = UNKNOWN |
67 assert w in __word_to_embedding | 81 assert w in __word_to_embedding |
68 return w | 82 return w |