comparison dataset.py @ 167:4803cb76e26b

Updated documentation
author Joseph Turian <turian@gmail.com>
date Mon, 12 May 2008 18:51:42 -0400
parents ee11ed427ba8
children 895b4b60f5e8
comparison
equal deleted inserted replaced
166:ee11ed427ba8 167:4803cb76e26b
528 return datasets[0] 528 return datasets[0]
529 return VStackedDataSet(datasets) 529 return VStackedDataSet(datasets)
530 530
531 class FieldsSubsetDataSet(DataSet): 531 class FieldsSubsetDataSet(DataSet):
532 """ 532 """
533 A sub-class of DataSet that selects a subset of the fields. 533 A sub-class of L{DataSet} that selects a subset of the fields.
534 """ 534 """
535 def __init__(self,src,fieldnames): 535 def __init__(self,src,fieldnames):
536 self.src=src 536 self.src=src
537 self.fieldnames=fieldnames 537 self.fieldnames=fieldnames
538 assert src.hasFields(*fieldnames) 538 assert src.hasFields(*fieldnames)
569 return FieldsSubsetDataSet(self.src[i],self.fieldnames) 569 return FieldsSubsetDataSet(self.src[i],self.fieldnames)
570 570
571 571
572 class DataSetFields(LookupList): 572 class DataSetFields(LookupList):
573 """ 573 """
574 Although a DataSet iterates over examples (like rows of a matrix), an associated 574 Although a L{DataSet} iterates over examples (like rows of a matrix), an associated
575 DataSetFields iterates over fields (like columns of a matrix), and can be understood 575 DataSetFields iterates over fields (like columns of a matrix), and can be understood
576 as a transpose of the associated dataset. 576 as a transpose of the associated dataset.
577 577
578 To iterate over fields, one can do 578 To iterate over fields, one can do
579 * for fields in dataset.fields() 579 * for fields in dataset.fields()
636 return (self.examples() | other.examples()).fields() 636 return (self.examples() | other.examples()).fields()
637 637
638 638
639 class MinibatchDataSet(DataSet): 639 class MinibatchDataSet(DataSet):
640 """ 640 """
641 Turn a LookupList of same-length (iterable) fields into an example-iterable dataset. 641 Turn a L{LookupList} of same-length (iterable) fields into an example-iterable dataset.
642 Each element of the lookup-list should be an iterable and sliceable, all of the same length. 642 Each element of the lookup-list should be an iterable and sliceable, all of the same length.
643 """ 643 """
644 def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack, 644 def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack,
645 values_hstack=DataSet().valuesHStack): 645 values_hstack=DataSet().valuesHStack):
646 """ 646 """
706 def valuesHStack(self,fieldnames,fieldvalues): 706 def valuesHStack(self,fieldnames,fieldvalues):
707 return self.values_hstack(fieldnames,fieldvalues) 707 return self.values_hstack(fieldnames,fieldvalues)
708 708
709 class HStackedDataSet(DataSet): 709 class HStackedDataSet(DataSet):
710 """ 710 """
711 A DataSet that wraps several datasets and shows a view that includes all their fields, 711 A L{DataSet} that wraps several datasets and shows a view that includes all their fields,
712 i.e. whose list of fields is the concatenation of their lists of fields. 712 i.e. whose list of fields is the concatenation of their lists of fields.
713 713
714 If a field name is found in more than one of the datasets, then either an error is 714 If a field name is found in more than one of the datasets, then either an error is
715 raised or the fields are renamed (either by prefixing the __name__ attribute 715 raised or the fields are renamed (either by prefixing the __name__ attribute
716 of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list). 716 of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list).
717 717
718 TODO: automatically detect a chain of stacked datasets due to A | B | C | D ... 718 @todo: automatically detect a chain of stacked datasets due to A | B | C | D ...
719 """ 719 """
720 def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None): 720 def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None):
721 DataSet.__init__(self,description,field_types) 721 DataSet.__init__(self,description,field_types)
722 self.datasets=datasets 722 self.datasets=datasets
723 self.accept_nonunique_names=accept_nonunique_names 723 self.accept_nonunique_names=accept_nonunique_names
805 """ 805 """
806 return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues) 806 return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues)
807 807
808 class VStackedDataSet(DataSet): 808 class VStackedDataSet(DataSet):
809 """ 809 """
810 A DataSet that wraps several datasets and shows a view that includes all their examples, 810 A L{DataSet} that wraps several datasets and shows a view that includes all their examples,
811 in the order provided. This clearly assumes that they all have the same field names 811 in the order provided. This clearly assumes that they all have the same field names
812 and all (except possibly the last one) are of finite length. 812 and all (except possibly the last one) are of finite length.
813 813
814 TODO: automatically detect a chain of stacked datasets due to A + B + C + D ... 814 @todo: automatically detect a chain of stacked datasets due to A + B + C + D ...
815 """ 815 """
816 def __init__(self,datasets): 816 def __init__(self,datasets):
817 self.datasets=datasets 817 self.datasets=datasets
818 self.length=0 818 self.length=0
819 self.index2dataset={} 819 self.index2dataset={}
1023 return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) 1023 return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset)
1024 1024
1025 1025
1026 class CachedDataSet(DataSet): 1026 class CachedDataSet(DataSet):
1027 """ 1027 """
1028 Wrap a dataset whose values are computationally expensive to obtain 1028 Wrap a L{DataSet} whose values are computationally expensive to obtain
1029 (e.g. because they involve some computation, or disk access), 1029 (e.g. because they involve some computation, or disk access),
1030 so that repeated accesses to the same example are done cheaply, 1030 so that repeated accesses to the same example are done cheaply,
1031 by caching every example value that has been accessed at least once. 1031 by caching every example value that has been accessed at least once.
1032 1032
1033 Optionally, for finite-length dataset, all the values can be computed 1033 Optionally, for finite-length dataset, all the values can be computed
1034 (and cached) upon construction of the CachedDataSet, rather at the 1034 (and cached) upon construction of the CachedDataSet, rather at the
1035 first access. 1035 first access.
1036 1036
1037 @todo when cache_all_upon_construction create mini-batches that are as 1037 @todo: when cache_all_upon_construction create mini-batches that are as
1038 large as possible but not so large as to fill up memory. 1038 large as possible but not so large as to fill up memory.
1039 1039
1040 @todo add disk-buffering capability, so that when the cache becomes too 1040 @todo: add disk-buffering capability, so that when the cache becomes too
1041 big for memory, we cache things on disk, trying to keep in memory only 1041 big for memory, we cache things on disk, trying to keep in memory only
1042 the record most likely to be accessed next. 1042 the record most likely to be accessed next.
1043 """ 1043 """
1044 def __init__(self,source_dataset,cache_all_upon_construction=False): 1044 def __init__(self,source_dataset,cache_all_upon_construction=False):
1045 self.source_dataset=source_dataset 1045 self.source_dataset=source_dataset
1090 else: 1090 else:
1091 return DataSet.__getitem__(self,i) 1091 return DataSet.__getitem__(self,i)
1092 1092
1093 class ApplyFunctionDataSet(DataSet): 1093 class ApplyFunctionDataSet(DataSet):
1094 """ 1094 """
1095 A dataset that contains as fields the results of applying a given function 1095 A L{DataSet} that contains as fields the results of applying a
1096 example-wise or minibatch-wise to all the fields of an input dataset. 1096 given function example-wise or minibatch-wise to all the fields of
1097 The output of the function should be an iterable (e.g. a list or a LookupList) 1097 an input dataset. The output of the function should be an iterable
1098 over the resulting values. 1098 (e.g. a list or a LookupList) over the resulting values.
1099 1099
1100 In minibatch mode, the function is expected to work on minibatches (takes 1100 In minibatch mode, the function is expected to work on minibatches
1101 a minibatch in input and returns a minibatch in output). More precisely, 1101 (takes a minibatch in input and returns a minibatch in output). More
1102 it means that each element of the input or output list should be iterable 1102 precisely, it means that each element of the input or output list
1103 and indexable over the individual example values (typically these 1103 should be iterable and indexable over the individual example values
1104 elements will be numpy arrays). All of the elements in the input and 1104 (typically these elements will be numpy arrays). All of the elements
1105 output lists should have the same length, which is the length of the 1105 in the input and output lists should have the same length, which is
1106 minibatch. 1106 the length of the minibatch.
1107 1107
1108 The function is applied each time an example or a minibatch is accessed. 1108 The function is applied each time an example or a minibatch is accessed.
1109 To avoid re-doing computation, wrap this dataset inside a CachedDataSet. 1109 To avoid re-doing computation, wrap this dataset inside a CachedDataSet.
1110 1110
1111 If the values_{h,v}stack functions are not provided, then 1111 If the values_{h,v}stack functions are not provided, then
1184 return ApplyFunctionSingleExampleIterator(self) 1184 return ApplyFunctionSingleExampleIterator(self)
1185 1185
1186 1186
1187 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): 1187 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None):
1188 """ 1188 """
1189 Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the 1189 Wraps an arbitrary L{DataSet} into one for supervised learning tasks
1190 user to define a set of fields as the 'input' field and a set of fields 1190 by forcing the user to define a set of fields as the 'input' field
1191 as the 'target' field. Optionally, a single weight_field can also be defined. 1191 and a set of fields as the 'target' field. Optionally, a single
1192 weight_field can also be defined.
1192 """ 1193 """
1193 args = ((input_fields,'input'),(target_fields,'target')) 1194 args = ((input_fields,'input'),(target_fields,'target'))
1194 if weight_field: args+=(([weight_field],'weight'),) 1195 if weight_field: args+=(([weight_field],'weight'),)
1195 return src_dataset.merge_fields(*args) 1196 return src_dataset.merge_fields(*args)
1196 1197