view sandbox/embeddings/read.py @ 458:ed6b0b3be8d2

Polished embeddings module
author Joseph Turian <turian@iro.umontreal.ca>
date Tue, 07 Oct 2008 19:13:53 -0400
parents sandbox/embeddings/original.py@131e19dfe793
children f400f62e7f9e
line wrap: on
line source

"""
Read the vocabulary and embedding weights files, and convert strings to
sequences of word embeddings.
"""

import string
import sys

from files import *

# Lazily loaded embedding state.  Everything here keeps its initial value
# until read_embeddings() has run to completion.
__read = False                 # flipped to True at the end of read_embeddings()
WORDS = WORD_TO_VECTOR = None  # vocabulary list / word -> vector dict once loaded
def read_embeddings():
    """
    Load the vocabulary and the embedding weights into module globals.

    WORDS is set to the whitespace-stripped lines of VOCABFILE, and
    WORD_TO_VECTOR to a dict mapping each of the first NUMBER_OF_WORDS
    words to a list of DIMENSIONS floats read from WEIGHTSFILE.  The files
    are read only once; after a successful load, later calls return
    immediately.

    Raises AssertionError if the vocabulary does not contain exactly
    NUMBER_OF_WORDS entries, or if the weights line does not contain
    exactly NUMBER_OF_WORDS * DIMENSIONS values.
    """
    global WORDS
    global WORD_TO_VECTOR
    global __read
    if __read:
        return

    # Context managers close the files even when one of the checks fails.
    with open(VOCABFILE) as vocab_file:
        # str methods replace the string-module functions, which no longer
        # exist in Python 3 and behave identically in Python 2.
        WORDS = [line.strip() for line in vocab_file]
    assert len(WORDS) == NUMBER_OF_WORDS, (
        "%s has %s words, expected %s"
        % (VOCABFILE, len(WORDS), NUMBER_OF_WORDS))

    WORD_TO_VECTOR = {}

    sys.stderr.write("Reading %s...\n" % WEIGHTSFILE)
    with open(WEIGHTSFILE) as weights_file:
        # The first line is skipped; all weights are read from the second.
        weights_file.readline()
        vals = [float(v) for v in weights_file.readline().split()]
    assert len(vals) == NUMBER_OF_WORDS * DIMENSIONS, (
        "%s has %s values, expected %s"
        % (WEIGHTSFILE, len(vals), NUMBER_OF_WORDS * DIMENSIONS))

    # NOTE(review): the flat list is reversed before it is sliced, so word i
    # receives the values of the (NUMBER_OF_WORDS - 1 - i)-th chunk of the
    # file with its components in reverse order.  Behaviour is kept as-is;
    # confirm this matches the layout of the weights file.
    vals.reverse()
    for i in range(NUMBER_OF_WORDS):
        WORD_TO_VECTOR[WORDS[i]] = vals[DIMENSIONS * i:DIMENSIONS * (i + 1)]
    __read = True

import re
# Matches one ASCII digit, so each digit of a token is substituted separately.
numberre = re.compile("[0-9]")
def convert_string(s):
    """
    Convert a string to a sequence of embeddings.

    The string is lower-cased and split on whitespace.  In each token every
    digit is replaced by the literal text "NUMBER" (after lower-casing, so
    the replacement stays upper-case) and the result is looked up in
    WORD_TO_VECTOR.  Tokens that are not in the vocabulary are reported on
    stderr and left out, so the result can be shorter than the number of
    tokens in s.

    Returns a list holding the embedding of each recognised token, in
    input order.
    """
    read_embeddings()
    e = []
    # str methods replace string.split/string.lower, which no longer exist
    # in Python 3 and behave identically in Python 2.
    for origw in s.lower().split():
        w = numberre.sub("NUMBER", origw)
        if w in WORD_TO_VECTOR:
            e.append(WORD_TO_VECTOR[w])
        else:
            sys.stderr.write("Word not in vocabulary: %s (original %s)\n" % (w, origw))
    return e