sandbox/embeddings/read.py @ 458:ed6b0b3be8d2

Polished embeddings module
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 19:13:53 -0400
parents sandbox/embeddings/original.py@131e19dfe793
children f400f62e7f9e
1 """
2 Read in the weights file
3 """

import sys

from files import *

WORDS = None
WORD_TO_VECTOR = None
__read = False

def read_embeddings():
    """Load the vocabulary and the weight vectors, once, into module globals."""
    global WORDS
    global WORD_TO_VECTOR
    global __read
    if __read: return

    # One vocabulary word per line, in the same order as the weight vectors.
    WORDS = [w.strip() for w in open(VOCABFILE).readlines()]
    assert len(WORDS) == NUMBER_OF_WORDS

    WORD_TO_VECTOR = {}

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    f = open(WEIGHTSFILE)
    # Skip the header line; the second line holds all the weight values.
    f.readline()
    vals = [float(v) for v in f.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
    # The values are stored back-to-front, so reverse the flat list before
    # slicing it into one DIMENSIONS-long vector per word.
    vals.reverse()
    for i in range(NUMBER_OF_WORDS):
        WORD_TO_VECTOR[WORDS[i]] = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
    __read = True
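
# A minimal sketch (not part of the original module) of the weights-file
# layout that read_embeddings() expects: a header line, then a single line
# holding all NUMBER_OF_WORDS * DIMENSIONS floats, stored back-to-front.
# The function below writes such a file for testing against a toy
# vocabulary; its name and the "header" text are illustrative assumptions.
def write_synthetic_weights(path, vectors):
    """vectors: one list of DIMENSIONS floats per vocabulary word, in order."""
    flat = []
    for v in vectors:
        flat.extend(v)
    flat.reverse()  # read_embeddings() reverses the values back
    f = open(path, "w")
    f.write("header\n")
    f.write(" ".join(str(v) for v in flat) + "\n")
    f.close()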
import re
numberre = re.compile("[0-9]")

def convert_string(s):
    """
    Convert a string to a sequence of embeddings, one per in-vocabulary word.
    """
    read_embeddings()
    e = []
    for origw in s.lower().split():
        # Replace each digit with the literal string "NUMBER" before lookup.
        w = numberre.sub("NUMBER", origw)
        if w in WORD_TO_VECTOR:
            e.append(WORD_TO_VECTOR[w])
        else:
            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
    return e
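
# A minimal usage sketch (not in the original file), assuming files.py
# defines VOCABFILE, WEIGHTSFILE, NUMBER_OF_WORDS, and DIMENSIONS, and that
# both files exist. The sentence is an arbitrary example.
if __name__ == "__main__":
    embeddings = convert_string("The quick brown fox jumped over 2 fences")
    if embeddings:
        # One vector (a list of DIMENSIONS floats) per in-vocabulary word.
        print "%d words embedded, %d dimensions each" % (len(embeddings), len(embeddings[0]))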