changeset 673:8fff4bc26f4c

merge
author Joseph Turian <turian@gmail.com>
date Mon, 30 Mar 2009 20:48:04 -0400
parents 27b1344a57b1 (diff) 9e62fd6b6677 (current diff)
children a5a41b7ddd26
files __init__.py _test_dataset.py _test_filetensor.py _test_linear_regression.py _test_lookup_list.py _test_nnet_ops.py _test_onehotop.py _test_random_transformation.py _test_xlogx.py activation.py algorithms/__init__.py algorithms/_test_logistic_regression.py algorithms/aa.py algorithms/daa.py algorithms/layer.py algorithms/logistic_regression.py algorithms/regressor.py algorithms/sgd.py algorithms/stacker.py algorithms/tests/test_aa.py algorithms/tests/test_daa.py algorithms/tests/test_regressor.py algorithms/tests/test_stacker.py amat.py autotest.py cost.py dataset.py datasets/MNIST.py datasets/__init__.py datasets/config.py datasets/dataset.py datasets/shapeset1.py datasets/smallNorb.py embeddings/README.txt embeddings/__init__.py embeddings/convert.py embeddings/one-per-line.py embeddings/parameters.py embeddings/process.py embeddings/read-original.py examples/linear_classifier.py examples/theano_update.py exceptions.py external/wrap_libsvm.py filetensor.py image_tools.py kernel_regression.py learner.py linear_regression.py lookup_list.py make_test_datasets.py misc_theano.py mlp_factory_approach.py nnet_ops.py noise.py onehotop.py onehotop.py.scalar pmat.py pylearn/datasets/embeddings/process.py random_transformation.py sandbox/README.txt sandbox/__init__.py sandbox/denoising_aa.py sandbox/gradient_learner.py sandbox/rbm/README.txt sandbox/rbm/__init__.py sandbox/rbm/main.py sandbox/rbm/model.py sandbox/rbm/parameters.py sandbox/simple_autoassociator/README.txt sandbox/simple_autoassociator/__init__.py sandbox/simple_autoassociator/graph.py sandbox/simple_autoassociator/main.py sandbox/simple_autoassociator/model.py sandbox/simple_autoassociator/parameters.py sandbox/sparse_random_autoassociator/README.txt sandbox/sparse_random_autoassociator/__init__.py sandbox/sparse_random_autoassociator/globals.py sandbox/sparse_random_autoassociator/graph.py sandbox/sparse_random_autoassociator/main.py sandbox/sparse_random_autoassociator/model.py sandbox/sparse_random_autoassociator/parameters.py sandbox/statscollector.py sparse_instance.py squashfn.py stat_ops.py stopper.py test_speed.py version.py weights.py xlogx.py
diffstat 1 file changed, 20 insertions(+), 6 deletions(-)
--- a/pylearn/datasets/embeddings/process.py	Mon Mar 30 19:51:13 2009 -0400
+++ b/pylearn/datasets/embeddings/process.py	Mon Mar 30 20:48:04 2009 -0400
@@ -50,17 +50,31 @@
 
 import re
 numberre = re.compile("[0-9]")
-
-def preprocess_word(w):
+slashre = re.compile("\\\/")
+ 
+def preprocess_word(origw):
     """
     Convert a word so that it can be embedded directly.
     Returned the preprocessed sequence.
-    @note: Perhaps run L{common.penntreebank.preprocess} on the word first.
+    @note: Preprocessing is appropriate for Penn Treebank style documents.
+    #@note: Perhaps run L{common.penntreebank.preprocess} on the word first.
     """
     read_embeddings()
-    if w not in __word_to_embedding:
-        w = string.lower(w)
-        w = numberre.sub("NUMBER", w)
+    if origw == "-LRB-": w = "("
+    elif origw == "-RRB-": w = ")"
+    elif origw == "-LCB-": w = "{"
+    elif origw == "-RCB-": w = "}"
+    elif origw == "-LSB-": w = "["
+    elif origw == "-RSB-": w = "]"
+    else:
+        w = origw
+        if w not in __word_to_embedding:
+            w = string.lower(w)
+            w = slashre.sub("/", w)
+            w = numberre.sub("NUMBER", w)
+#    if w not in __word_to_embedding:
+#        w = string.lower(w)
+#        w = numberre.sub("NUMBER", w)
     if w not in __word_to_embedding:
 #        sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
         w = UNKNOWN
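
For readers who want the patched control flow in one place, the hunk above can be read as the following standalone sketch. The names normalize_word, vocabulary, _PTB_BRACKETS, and the UNKNOWN value are illustrative stand-ins for the module's __word_to_embedding dictionary and its UNKNOWN token (this is not the actual pylearn code), and str.lower is used in place of the Python 2 string.lower call:

    import re

    _number_re = re.compile("[0-9]")   # same pattern as the patch: one digit at a time
    _slash_re = re.compile(r"\\/")     # matches the Penn Treebank escape "\/" (backslash + slash)

    # Penn Treebank writes brackets as tokens such as -LRB-; map them back to literals.
    _PTB_BRACKETS = {
        "-LRB-": "(", "-RRB-": ")",
        "-LCB-": "{", "-RCB-": "}",
        "-LSB-": "[", "-RSB-": "]",
    }

    UNKNOWN = "*UNKNOWN*"              # stand-in for the module's UNKNOWN token

    def normalize_word(origw, vocabulary):
        """Return the key under which origw should be looked up in vocabulary."""
        if origw in _PTB_BRACKETS:
            w = _PTB_BRACKETS[origw]                  # -LRB- -> "(", etc.
        else:
            w = origw
            if w not in vocabulary:
                w = w.lower()                         # patch calls string.lower(w) (Python 2)
                w = _slash_re.sub("/", w)             # un-escape "\/" back to "/"
                w = _number_re.sub("NUMBER", w)       # replace each digit, as in the patch
        if w not in vocabulary:
            w = UNKNOWN                               # out-of-vocabulary fallback
        return w

    vocab = {"(": 0, "the": 1, "NUMBER": 2}
    print(normalize_word("-LRB-", vocab))   # -> "("
    print(normalize_word("The", vocab))     # -> "the"
    print(normalize_word("7", vocab))       # -> "NUMBER"
    print(normalize_word("w/e", vocab))     # -> "*UNKNOWN*"

As in the patch, bracket tokens are mapped before the vocabulary check, and a word whose normalized form is still missing from the embedding table falls back to UNKNOWN.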