changeset 458:ed6b0b3be8d2

Polished embeddings module
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 19:13:53 -0400
parents 34acf8db186d
children f400f62e7f9e
files sandbox/embeddings/README.txt sandbox/embeddings/convert.py sandbox/embeddings/files.py sandbox/embeddings/original.py sandbox/embeddings/read.py
diffstat 5 files changed, 67 insertions(+), 55 deletions(-)
--- a/sandbox/embeddings/README.txt	Tue Oct 07 18:21:32 2008 -0400
+++ b/sandbox/embeddings/README.txt	Tue Oct 07 19:13:53 2008 -0400
@@ -1,1 +1,3 @@
 Messy scripts for working with Jason + Ronan's embeddings.
+
+File locations are given in files.py.
--- a/sandbox/embeddings/convert.py	Tue Oct 07 18:21:32 2008 -0400
+++ b/sandbox/embeddings/convert.py	Tue Oct 07 19:13:53 2008 -0400
@@ -1,12 +1,15 @@
 #!/usr/bin/python
+"""
+Convert sentences read from stdin to word embeddings, and output YAML.
+"""
 
 import sys, string
-import original
+import read
 import yaml
 
 output = []
 for l in sys.stdin:
     l = string.strip(l)
-    output.append((l, original.convert_string(l)))
+    output.append((l, read.convert_string(l)))
 
 print yaml.dump(output)
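A minimal usage sketch, assuming PyYAML is installed and the script was run as
python convert.py < sentences.txt > output.yaml (both filenames hypothetical);
in this era of PyYAML, yaml.load reconstructs the dumped (sentence, vectors)
tuples directly:

    # Python 2, matching the module's vintage.
    import yaml
    # Each entry is a (sentence, list of per-word embedding vectors) pair.
    pairs = yaml.load(open("output.yaml"))
    for sentence, vectors in pairs:
        print sentence, len(vectors)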
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/files.py	Tue Oct 07 19:13:53 2008 -0400
@@ -0,0 +1,7 @@
+"""
+Locations of the embedding data files.
+"""
+WEIGHTSFILE     = "/u/turian/data/word_embeddings.collobert-and-weston/lm-weights.txt"
+VOCABFILE       = "/u/turian/data/word_embeddings.collobert-and-weston/words.asc"
+NUMBER_OF_WORDS = 30000
+DIMENSIONS      = 50
--- a/sandbox/embeddings/original.py	Tue Oct 07 18:21:32 2008 -0400
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,53 +0,0 @@
-"""
-Read in the weights file
-"""
-
-import string
-import sys
-
-WORDS = None
-WORD_TO_VECTOR = None
-def read():
-    global WORDS
-    global WORD_TO_VECTOR
-
-    weightsfile = "lm-weights.txt"
-    vocabfile = "words.asc"
-    size = 30000
-    dimensions = 50
-
-    WORDS = [string.strip(w) for w in open(vocabfile).readlines()]
-    assert len(WORDS) == 30000
-
-    import numpy, math
-    from common.str import percent
-
-    WORD_TO_VECTOR = {}
-
-    sys.stderr.write("Reading %s...\n" % weightsfile)
-    f = open(weightsfile)
-    f.readline()
-    vals = [float(v) for v in string.split(f.readline())]
-    assert len(vals) == size * dimensions
-    vals.reverse()
-    for i in range(size):
-        l = vals[dimensions*i:dimensions*(i+1)]
-        w = WORDS[i]
-        WORD_TO_VECTOR[w] = l
-
-import re
-numberre = re.compile("[0-9]")
-def convert_string(s):
-    """
-    Convert a string to a sequence of embeddings.
-    """
-    e = []
-    for origw in string.split(string.lower(s)):
-        w = numberre.sub("NUMBER", origw)
-        if w in WORD_TO_VECTOR:
-            e.append(WORD_TO_VECTOR[w])
-        else:
-            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
-    return e
-
-read()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/sandbox/embeddings/read.py	Tue Oct 07 19:13:53 2008 -0400
@@ -0,0 +1,53 @@
+"""
+Read the vocabulary and weights files and build WORD_TO_VECTOR.
+"""
+
+import string
+import sys
+
+from files import *
+
+WORDS = None
+WORD_TO_VECTOR = None
+__read = False  # Becomes True once the embeddings have been loaded.
+def read_embeddings():
+    global WORDS
+    global WORD_TO_VECTOR
+    global __read
+    if __read: return
+
+    WORDS = [string.strip(w) for w in open(VOCABFILE).readlines()]
+    assert len(WORDS) == NUMBER_OF_WORDS
+
+    import numpy, math
+    from common.str import percent
+
+    WORD_TO_VECTOR = {}
+
+    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
+    f = open(WEIGHTSFILE)
+    f.readline()  # Skip the first line of the weights file.
+    vals = [float(v) for v in string.split(f.readline())]
+    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
+    vals.reverse()  # Reverse the flat weight list before slicing it into per-word vectors.
+    for i in range(NUMBER_OF_WORDS):
+        l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
+        w = WORDS[i]
+        WORD_TO_VECTOR[w] = l
+    __read = True
+
+import re
+numberre = re.compile("[0-9]")  # Each digit is rewritten to "NUMBER" before lookup.
+def convert_string(s):
+    """
+    Convert a string to a sequence of embeddings.
+    """
+    read_embeddings()
+    e = []
+    for origw in string.split(string.lower(s)):
+        w = numberre.sub("NUMBER", origw)
+        if w in WORD_TO_VECTOR:
+            e.append(WORD_TO_VECTOR[w])
+        else:
+            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
+    return e
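A minimal sketch of calling the new module directly, assuming the data files
named in files.py are readable; the example sentence is hypothetical:

    # Python 2; the embeddings are loaded lazily on the first call.
    import read
    vectors = read.convert_string("the cat sat on the mat")
    # One DIMENSIONS-length (50) vector per in-vocabulary word;
    # out-of-vocabulary words are skipped with a warning on stderr.
    print len(vectors)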