Mercurial > pylearn
changeset 920:a5c33f01c9a4
removed bad tracks from MajorMiner dataset
author | James Bergstra <bergstrj@iro.umontreal.ca> |
---|---|
date | Sat, 20 Mar 2010 15:19:43 -0400 |
parents | 3901d06e2d96 |
children | 3476044d7326 |
files | pylearn/datasets/majorminer.py |
diffstat | 1 files changed, 81 insertions(+), 2 deletions(-) [+] |
line wrap: on
line diff
--- a/pylearn/datasets/majorminer.py Sat Mar 20 15:18:54 2010 -0400
+++ b/pylearn/datasets/majorminer.py Sat Mar 20 15:19:43 2010 -0400
@@ -2,7 +2,7 @@
 Load the MajorMiner dataset
 """
 
-import logging, os
+import logging, os,sys
 from .config import data_root
 
 _logger = logging.getLogger('pylearn.datasets.majorminer')
@@ -37,6 +37,82 @@
     return tag_count_track
 
 
+try:
+    import mad
+except ImportError:
+    pass
+
+def remove_bad_tracks(three_col, min_seconds=8):
+    """Heuristically filter the three_col data to contain only valid tracks
+    """
+    bad_tracks = set()
+    all_tracks = set()
+
+    silent_tracks = []
+    missing_in_action = []
+    too_short = []
+
+    try:
+        _file = mad.MadFile
+        test_len = True
+    except:
+        _file = file
+        test_len = False
+
+
+    for tag, count, track in three_col:
+        if track in all_tracks:
+            continue
+        all_tracks.add(track)
+        if tag in set(['silence', 'end', 'nothing']):
+            bad_tracks.add(track)
+            silent_tracks.append(track)
+            _logger.debug("silent file: %s" % track)
+            continue
+
+        try:
+            t = _file(track)
+        except IOError:
+            bad_tracks.add(track)
+            missing_in_action.append(track)
+            _logger.debug("missing file: %s"% track)
+            # it is normal to have 2
+            #if len(missing_in_action) > 5:
+                #raise Exception('Too many missing files:', missing_in_action)
+            continue
+
+        if test_len and t.total_time() < min_seconds*1000:
+            # too short
+            bad_tracks.add(track)
+            _logger.debug("short file: %f %s" %(t.total_time(), track))
+            too_short.append((track, t.total_time()))
+            # it is normal to have maybe 10?
+            #if len(too_short) > 40:
+                #raise Exception('Too many short files:', too_short)
+            continue
+
+    if silent_tracks:
+        _logger.warning("removed %i silent files"% len(silent_tracks))
+
+    if missing_in_action:
+        _logger.warning("missing %i files"% len(missing_in_action))
+
+    if too_short:
+        _logger.warning("discarded %i files less than %f seconds long"%(
+            len(too_short), min_seconds))
+
+    _logger.info("kept %i of %i tracks"% (len(all_tracks)-len(bad_tracks),
+        len(all_tracks)))
+
+    # return a cleaned three_column list
+    rval = []
+    for tag, count, track in three_col:
+        if track not in bad_tracks:
+            rval.append((tag, count, track))
+    return rval
+
+
+
 def list_tracks(three_col):
     tracks = list(set(tup[2] for tup in three_col))
     tracks.sort()
@@ -64,8 +140,11 @@
 
 
 class Meta(object):
-    def __init__(self, tagfile=None, trackroot=None, expected_tagfile_len=51556):
+    def __init__(self, tagfile=None, trackroot=None, expected_tagfile_len=51556,
+            filter_broken=True):
         self.three_column = three_column(tagfile, trackroot, expected_tagfile_len)
+        if filter_broken:
+            self.three_column = remove_bad_tracks(self.three_column)
         self.tracks = list_tracks(self.three_column)
         self.tags = list_tags(self.three_column)
         self.track_tags = track_tags(self.three_column, self.tracks, self.tags)