Mercurial > pylearn
view featuremap.py @ 363:9e84e8a20a75
Added to misc.file
author | Joseph Turian <turian@gmail.com> |
---|---|
date | Thu, 03 Jul 2008 17:52:11 -0400 |
parents | 18702ceb2096 |
children |
line wrap: on
line source
""" Feature mapping. A feature map is idenfied by a unique name, e.g. "parsing features, experiment 35". This unique name also determines the name of the on-disk version of the feature map. @todo: This should be rewritten to be more Pythonic. Perhaps use a class? @todo: Maybe look at older C++ Id/Vocab code? Id could have a __str__ method @todo: Clearer documentation. @todo: Create an fmap directory @todo: Use cPickle, not pickle @todo: Autosynchronize mode: Each time a new entry is added to a L{FeatureMap}, the on-disk version of the feature map is updated. Alternately, synchronize to disk when the object is destroyed. """ from common import myopen import pickle # We want this map to be a singleton name_to_fmap = {} def get(name=None, synchronize=True): """ Get the L{FeatureMap} for a particular feature name. """ global name_to_fmap if name not in name_to_fmap: # Create a new L{FeatureMap} name_to_fmap[name] = FeatureMap(name, synchronize) fmap = name_to_fmap[name] assert fmap.name == name assert fmap.synchronize == synchronize return fmap def free_memory(): """ Free the memory associated with all feature maps. """ global name_to_fmap name_to_fmap = {} class KeyError(Exception): """Exception raised for keys missing from a readonly FeatureMap Attributes: name -- Name of the FeatureMap raising the error. key -- Key not present. """ def __init__(self, name, key): self.name = name self.key = key class FeatureMap: """ Map from a feature string to a numerial ID (starting from 0). If synchronize is False, the feature map is considered temporary and we never actually synchronize it with disk. It expires with the lifetime of this execution. @warning: Do not construct this directly. Instead, use the global get() method. @todo: More documentation """ # name = None # synchronize = True # map = {} # readonly = False # If True, then each time we look for an ID # that is not present we throw a ValueError def __init__(self, name=None, synchronize=True): self.name = name self.synchronize = synchronize self.map = {} self.reverse_map = {} self.readonly = False # There must be a name provided, or we cannot perform synchronization assert self.name or not self.synchronize if self.synchronize: # Try loading map from disk self.load() def exists(self, str): """ Return True iff this str is in the map """ return str in self.map def id(self, str): """ Get the ID for this string. Add a new ID if not is available """ """ @todo: Don't want to synchronize every add, this may be too slow. """ if str not in self.map: if self.readonly: raise KeyError(self.name, str) l = self.len self.map[str] = l self.reverse_map[l] = str assert l+1 == self.len return l else: return self.map[str] def str(self, id): """ Get the string for this ID. """ return self.reverse_map[id] # This next function should just convert a list to a list # def ids(self, lst): # """ Get the IDs for the elements of a list. Return the ID numbers of these keys as a map. """ # idset = {} # for k in lst: # try: # idset[self.id(k)] = True # except KeyError, e: # print "Feature map '%s' does not contain key '%s'. Skipping..." % (e.name, e.key) # return idset len = property(lambda self: len(self.map), doc="Number of different feature IDs") filename = property(lambda self: "fmap.%s.pkl.gz" % self.name, doc="The on-disk file synchronized to this feature map.") def load(self): """ Load the map from disk. """ assert self.synchronize try: f = myopen(self.filename, "rb") (self.map, self.reverse_map) = pickle.load(f) except IOError: print "Could not open %s" % self.filename def dump(self): """ Dump the map to disk. """ assert self.synchronize f = myopen(self.filename, "wb") pickle.dump((self.map, self.reverse_map), f)