view sandbox/embeddings/read.py @ 460:fda72e944104

\/ -> /
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 23:07:23 -0400
parents f400f62e7f9e
children
line wrap: on
line source

"""
Read in the weights file
"""

import string
import sys

from files import *

# Vocabulary words in file order; populated lazily by read_embeddings().
WORDS = None
# Maps word -> embedding vector (list of DIMENSIONS floats);
# populated lazily by read_embeddings().
WORD_TO_VECTOR = None
# Guard flag so read_embeddings() performs the expensive load only once.
__read = False
def read_embeddings():
    """
    Load the vocabulary and embedding weights into module globals.

    Populates WORDS (the vocabulary, one word per line of VOCABFILE) and
    WORD_TO_VECTOR (word -> list of DIMENSIONS floats read from
    WEIGHTSFILE).  Idempotent: subsequent calls are no-ops.
    """
    global WORDS
    global WORD_TO_VECTOR
    global __read
    if __read: return

    # One word per line; close the file instead of leaking the handle.
    f = open(VOCABFILE)
    try:
        WORDS = [w.strip() for w in f.readlines()]
    finally:
        f.close()
    assert len(WORDS) == NUMBER_OF_WORDS

    WORD_TO_VECTOR = {}

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    f = open(WEIGHTSFILE)
    try:
        # First line is a header we skip; second line holds all weights
        # as whitespace-separated floats.
        f.readline()
        vals = [float(v) for v in f.readline().split()]
    finally:
        f.close()
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS
    # NOTE(review): the weights appear to be stored back-to-front in the
    # file; reversing puts word i's vector at
    # vals[DIMENSIONS*i : DIMENSIONS*(i+1)].  TODO confirm against the
    # code that writes WEIGHTSFILE.
    vals.reverse()
    for i in range(NUMBER_OF_WORDS):
        WORD_TO_VECTOR[WORDS[i]] = vals[DIMENSIONS*i:DIMENSIONS*(i+1)]
    __read = True

import re
# Matches any single digit; used to collapse numeric tokens to "NUMBER".
numberre = re.compile("[0-9]")
# Matches the Penn Treebank escaped slash "\/" (a literal backslash
# followed by "/").  Raw string replaces the original non-raw literal,
# which relied on the invalid escape sequence "\/" passing through
# unchanged -- a DeprecationWarning in Python 3.6+.
slashre = re.compile(r"\\/")

def preprocess(l):
    """
    Convert a sequence so that it can be embedded directly.
    Returned the preprocessed sequence.
    @note: Preprocessing is appropriate for Penn Treebank style documents.
    """
    read_embeddings()
    # Penn Treebank bracket tokens map straight to their literal forms
    # and bypass the lowercasing/substitution pipeline.
    brackets = {
        "-LRB-": "(", "-RRB-": ")",
        "-LCB-": "{", "-RCB-": "}",
        "-LSB-": "[", "-RSB-": "]",
    }
    processed = []
    for origw in l:
        if origw in brackets:
            w = brackets[origw]
        else:
            # Lowercase, unescape "\/" -> "/", then collapse each digit
            # to the literal token "NUMBER".
            w = numberre.sub("NUMBER",
                             slashre.sub("/", string.lower(origw)))
        if w not in WORD_TO_VECTOR:
            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
            w = UNKNOWN
        assert w in WORD_TO_VECTOR
        processed.append(w)
    return processed

#def convert_string(s, strict=False):
#    """
#    Convert a string to a sequence of embeddings.
#    @param strict: If strict, then words *must* be in the vocabulary.
#    @todo: DEPRECATED Remove this function.
#    """
#    read_embeddings()
#    e = []
#    for origw in string.split(string.lower(s)):
#        w = numberre.sub("NUMBER", origw)
#        if w in WORD_TO_VECTOR:
#            e.append(WORD_TO_VECTOR[w])
#        else:
#            sys.stderr.write("Word not in vocabulary, using %s: %s (original %s)\n" % (UNKNOWN, w, origw))
#            assert not strict
#            e.append(WORD_TO_VECTOR[UNKNOWN])
#    return e