Mercurial > pylearn
diff featuremap.py @ 356:18702ceb2096
Added more functions
author | Joseph Turian <turian@iro.umontreal.ca> |
---|---|
date | Thu, 19 Jun 2008 16:18:37 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/featuremap.py Thu Jun 19 16:18:37 2008 -0400 @@ -0,0 +1,132 @@ +""" +Feature mapping. + +A feature map is idenfied by a unique name, e.g. "parsing features, experiment 35". +This unique name also determines the name of the on-disk version of the feature map. + +@todo: This should be rewritten to be more Pythonic. Perhaps use a class? +@todo: Maybe look at older C++ Id/Vocab code? Id could have a __str__ method +@todo: Clearer documentation. +@todo: Create an fmap directory +@todo: Use cPickle, not pickle + +@todo: Autosynchronize mode: Each time a new entry is added +to a L{FeatureMap}, the on-disk version of the feature map is +updated. Alternately, synchronize to disk when the object is destroyed. +""" + +from common import myopen +import pickle + +# We want this map to be a singleton +name_to_fmap = {} + +def get(name=None, synchronize=True): + """ + Get the L{FeatureMap} for a particular feature name. + """ + global name_to_fmap + if name not in name_to_fmap: + # Create a new L{FeatureMap} + name_to_fmap[name] = FeatureMap(name, synchronize) + fmap = name_to_fmap[name] + assert fmap.name == name + assert fmap.synchronize == synchronize + return fmap + +def free_memory(): + """ + Free the memory associated with all feature maps. + """ + global name_to_fmap + name_to_fmap = {} + +class KeyError(Exception): + """Exception raised for keys missing from a readonly FeatureMap + Attributes: + name -- Name of the FeatureMap raising the error. + key -- Key not present. + """ + def __init__(self, name, key): + self.name = name + self.key = key + + +class FeatureMap: + """ + Map from a feature string to a numerial ID (starting from 0). + + If synchronize is False, the feature map is considered temporary + and we never actually synchronize it with disk. It expires with the + lifetime of this execution. + + @warning: Do not construct this directly. Instead, use the global get() method. + @todo: More documentation + """ + +# name = None +# synchronize = True +# map = {} +# readonly = False # If True, then each time we look for an ID + # that is not present we throw a ValueError + def __init__(self, name=None, synchronize=True): + self.name = name + self.synchronize = synchronize + self.map = {} + self.reverse_map = {} + self.readonly = False + + # There must be a name provided, or we cannot perform synchronization + assert self.name or not self.synchronize + + if self.synchronize: + # Try loading map from disk + self.load() + + def exists(self, str): + """ Return True iff this str is in the map """ + return str in self.map + + def id(self, str): + """ Get the ID for this string. Add a new ID if not is available """ + """ @todo: Don't want to synchronize every add, this may be too slow. """ + if str not in self.map: + if self.readonly: raise KeyError(self.name, str) + l = self.len + self.map[str] = l + self.reverse_map[l] = str + assert l+1 == self.len + return l + else: return self.map[str] + + def str(self, id): + """ Get the string for this ID. """ + return self.reverse_map[id] + + # This next function should just convert a list to a list +# def ids(self, lst): +# """ Get the IDs for the elements of a list. Return the ID numbers of these keys as a map. """ +# idset = {} +# for k in lst: +# try: +# idset[self.id(k)] = True +# except KeyError, e: +# print "Feature map '%s' does not contain key '%s'. Skipping..." % (e.name, e.key) +# return idset + + len = property(lambda self: len(self.map), doc="Number of different feature IDs") + filename = property(lambda self: "fmap.%s.pkl.gz" % self.name, doc="The on-disk file synchronized to this feature map.") + + def load(self): + """ Load the map from disk. """ + assert self.synchronize + try: + f = myopen(self.filename, "rb") + (self.map, self.reverse_map) = pickle.load(f) + except IOError: print "Could not open %s" % self.filename + + def dump(self): + """ Dump the map to disk. """ + assert self.synchronize + f = myopen(self.filename, "wb") + pickle.dump((self.map, self.reverse_map), f)