view embeddings/process.py @ 498:2be795cc5c3a

More documentation + todo
author Joseph Turian <turian@gmail.com>
date Tue, 28 Oct 2008 12:25:15 -0400
parents 4335309f4924
children 90a76a8238e8
line wrap: on
line source

"""
Read in the weights file
"""

import string
import sys

from parameters import *

# Vocabulary words, in VOCABFILE order; filled in by read_embeddings().
__words = None
# Maps each vocabulary word to its list of embedding values; filled in by
# read_embeddings().
__word_to_embedding = None
# Set to True once read_embeddings() has finished loading, so that later
# calls return immediately.
__read = False

def word_to_embedding(w):
    """
    Look up the embedding of a vocabulary word.
    The embeddings are loaded first, if that has not happened yet.
    @param w: A word exactly as it appears in the vocabulary file.
    @return: The list of embedding values stored for w.
    @raise KeyError: If w is not in the vocabulary.
    """
    read_embeddings()
    embedding = __word_to_embedding[w]
    return embedding

def read_embeddings():
    """
    Load the vocabulary and the embedding weights into module-level state.

    Reads one word per line from VOCABFILE. Then reads WEIGHTSFILE: its
    first line is skipped, and its second line must contain
    NUMBER_OF_WORDS * DIMENSIONS whitespace-separated floats. The i-th run
    of DIMENSIONS values becomes the embedding of the i-th vocabulary word.

    The files are read at most once; later calls return immediately.
    Progress messages are written to stderr. Module state is only updated
    after both files have been read and validated.

    @raise AssertionError: If the number of vocabulary words or the number
    of weights does not match NUMBER_OF_WORDS and DIMENSIONS.
    """
    global __words
    global __word_to_embedding
    global __read
    if __read:
        return

    # Use context managers so that both files are closed deterministically,
    # even if parsing or validation fails.
    with open(VOCABFILE) as vocabfile:
        words = [line.strip() for line in vocabfile]
    assert len(words) == NUMBER_OF_WORDS

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    with open(WEIGHTSFILE) as weightsfile:
        # The first line is skipped (presumably a header -- TODO confirm).
        weightsfile.readline()
        vals = [float(v) for v in weightsfile.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS

    # The weights are stored flat: word i owns the slice
    # [DIMENSIONS * i, DIMENSIONS * (i + 1)).
    embeddings = {}
    for i, w in enumerate(words):
        embeddings[w] = vals[DIMENSIONS * i:DIMENSIONS * (i + 1)]

    __words = words
    __word_to_embedding = embeddings
    __read = True
    sys.stderr.write("...done reading %s\n" % WEIGHTSFILE)

import re
# Matches a single ASCII digit. Substitution is per digit, not per run of
# digits.
numberre = re.compile(r"[0-9]")
# Matches an escaped slash, i.e. a backslash followed by "/". Written as a
# raw string so that the pattern does not rely on the invalid string escape
# "\/".
slashre = re.compile(r"\\/")

# Penn Treebank bracket tokens and the literal brackets they stand for.
_PTB_BRACKETS = {
    "-LRB-": "(",
    "-RRB-": ")",
    "-LCB-": "{",
    "-RCB-": "}",
    "-LSB-": "[",
    "-RSB-": "]",
}

def preprocess_word(origw):
    """
    Convert a word so that it can be embedded directly.
    Return the preprocessed word.

    Penn Treebank bracket tokens (e.g. "-LRB-") are mapped to the literal
    bracket. Any other word is lowercased, escaped slashes (a backslash
    followed by "/") become "/", and every digit is replaced by "NUMBER".
    A result that is not in the vocabulary is replaced by UNKNOWN.

    @param origw: The original word.
    @return: A word that is guaranteed to be in the vocabulary.
    @raise AssertionError: If UNKNOWN itself is not in the vocabulary.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    if origw in _PTB_BRACKETS:
        w = _PTB_BRACKETS[origw]
    else:
        # Lowercase before substituting digits, so that the inserted
        # "NUMBER" marker keeps its upper case.
        w = origw.lower()
        w = slashre.sub("/", w)
        w = numberre.sub("NUMBER", w)
    if w not in __word_to_embedding:
        # Out-of-vocabulary words silently fall back to the UNKNOWN token.
        w = UNKNOWN
    assert w in __word_to_embedding
    return w

def preprocess_seq(l):
    """
    Preprocess every word of a sequence so that it can be embedded directly.
    @param l: An iterable of original words.
    @return: A new list holding the result of preprocess_word for each
    word, in the same order.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    # Load the embeddings up front, even when the sequence is empty.
    read_embeddings()
    return [preprocess_word(token) for token in l]

#def convert_string(s, strict=False):
#    """
#    Convert a string to a sequence of embeddings.
#    @param strict: If strict, then words *must* be in the vocabulary.
#    @todo: DEPRECATED Remove this function.
#    """
#    read_embeddings()
#    e = []
#    for origw in string.split(string.lower(s)):
#        w = numberre.sub("NUMBER", origw)
#        if w in __word_to_embedding:
#            e.append(__word_to_embedding[w])
#        else:
#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
#            assert not strict
#            e.append(__word_to_embedding[UNKNOWN])
#    return e

#def test():
#    """
#    Debugging code.
#    """
#    read_embeddings()
#    for w in __word_to_embedding:
#        assert len(__word_to_embedding[w]) == 50
#    import numpy
#    for w1 in __words:
#        e1 = numpy.asarray(__word_to_embedding[w1])
#        lst = []
#        print w1, numpy.dot(e1, e1)
#        for w2 in __word_to_embedding:
#            if w1 >= w2: continue
#            e2 = numpy.asarray(__word_to_embedding[w2])
#            d = (e1 - e2)
#            l2 = numpy.dot(d, d)
#            lst.append((l2, w1, w2))
#        lst.sort()
#        print lst[:10]
#
#test()