# sandbox/embeddings/process.py @ 461:1243716ade6a

"""
Read in the weights file
"""

import string
import sys

from parameters import *

# Module-level caches, filled lazily by read_embeddings().
__words = None
__word_to_embedding = None
__read = False

def word_to_embedding(w):
    """Return the embedding (a list of DIMENSIONS floats) for word w."""
    read_embeddings()
    return __word_to_embedding[w]
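
# For example, word_to_embedding("the") would return a list of DIMENSIONS
# floats (assuming "the" appears in VOCABFILE). Words missing from the
# vocabulary raise a KeyError here, so callers normally run their tokens
# through preprocess() first, which maps them to UNKNOWN.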

def read_embeddings():
    """Read the vocabulary and weights files into the module-level caches."""
    global __words
    global __word_to_embedding
    global __read
    if __read: return

    __words = [string.strip(w) for w in open(VOCABFILE).readlines()]
    assert len(__words) == NUMBER_OF_WORDS

    __word_to_embedding = {}

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    f = open(WEIGHTSFILE)
    f.readline()    # Skip the first line of the weights file.
    # All weights are on a single line: NUMBER_OF_WORDS * DIMENSIONS floats.
    vals = [float(v) for v in string.split(f.readline())]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
    # The flat weight vector is reversed before being sliced into
    # per-word embeddings of length DIMENSIONS.
    vals.reverse()
    for i in range(NUMBER_OF_WORDS):
        l = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
        w = __words[i]
        __word_to_embedding[w] = l
    f.close()
    __read = True
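
# Illustrative sketch (not taken from an actual weights file) of the layout
# the loader above assumes: one line that is skipped, then a single line of
# NUMBER_OF_WORDS * DIMENSIONS whitespace-separated floats. For example,
# with NUMBER_OF_WORDS=3 and DIMENSIONS=2 the file might look like:
#
#   <first line, ignored>
#   0.01 -0.20 0.15 0.07 -0.33 0.42
#
# After read_embeddings() runs, __word_to_embedding maps each word in
# VOCABFILE to a list of DIMENSIONS floats.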

import re
# Each digit is replaced by the string "NUMBER" during preprocessing.
numberre = re.compile("[0-9]")
# Penn Treebank escapes forward slashes as "\/"; this matches the escaped
# form (a literal backslash followed by a slash).
slashre = re.compile(r"\\/")

def preprocess(l):
    """
    Convert a sequence of tokens so that it can be embedded directly.
    Return the preprocessed sequence.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    lnew = []
    for origw in l:
        # Map Penn Treebank bracket tokens back to their literal characters.
        if origw == "-LRB-": w = "("
        elif origw == "-RRB-": w = ")"
        elif origw == "-LCB-": w = "{"
        elif origw == "-RCB-": w = "}"
        elif origw == "-LSB-": w = "["
        elif origw == "-RSB-": w = "]"
        else:
            w = origw
            w = string.lower(w)
            w = slashre.sub("/", w)
            w = numberre.sub("NUMBER", w)
        if w not in __word_to_embedding:
            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
            w = UNKNOWN
        assert w in __word_to_embedding
        lnew.append(w)
    return lnew

#def convert_string(s, strict=False):
#    """
#    Convert a string to a sequence of embeddings.
#    @param strict: If strict, then words *must* be in the vocabulary.
#    @todo: DEPRECATED Remove this function.
#    """
#    read_embeddings()
#    e = []
#    for origw in string.split(string.lower(s)):
#        w = numberre.sub("NUMBER", origw)
#        if w in __word_to_embedding:
#            e.append(__word_to_embedding[w])
#        else:
#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
#            assert not strict
#            e.append(__word_to_embedding[UNKNOWN])
#    return e
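
if __name__ == "__main__":
    # Minimal usage sketch, assuming parameters.py points at real VOCABFILE
    # and WEIGHTSFILE; the token sequence below is only illustrative.
    tokens = ["-LRB-", "The", "rate", "rose", "to", "5\\/8", "-RRB-", "."]
    cleaned = preprocess(tokens)
    for w in cleaned:
        e = word_to_embedding(w)
        sys.stderr.write("%s -> embedding of length %d\n" % (w, len(e)))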