Mercurial > pylearn
comparison dataset.py @ 167:4803cb76e26b
Updated documentation
author | Joseph Turian <turian@gmail.com> |
---|---|
date | Mon, 12 May 2008 18:51:42 -0400 |
parents | ee11ed427ba8 |
children | 895b4b60f5e8 |
comparison
equal
deleted
inserted
replaced
166:ee11ed427ba8 | 167:4803cb76e26b |
---|---|
528 return datasets[0] | 528 return datasets[0] |
529 return VStackedDataSet(datasets) | 529 return VStackedDataSet(datasets) |
530 | 530 |
531 class FieldsSubsetDataSet(DataSet): | 531 class FieldsSubsetDataSet(DataSet): |
532 """ | 532 """ |
533 A sub-class of DataSet that selects a subset of the fields. | 533 A sub-class of L{DataSet} that selects a subset of the fields. |
534 """ | 534 """ |
535 def __init__(self,src,fieldnames): | 535 def __init__(self,src,fieldnames): |
536 self.src=src | 536 self.src=src |
537 self.fieldnames=fieldnames | 537 self.fieldnames=fieldnames |
538 assert src.hasFields(*fieldnames) | 538 assert src.hasFields(*fieldnames) |
569 return FieldsSubsetDataSet(self.src[i],self.fieldnames) | 569 return FieldsSubsetDataSet(self.src[i],self.fieldnames) |
570 | 570 |
571 | 571 |
572 class DataSetFields(LookupList): | 572 class DataSetFields(LookupList): |
573 """ | 573 """ |
574 Although a DataSet iterates over examples (like rows of a matrix), an associated | 574 Although a L{DataSet} iterates over examples (like rows of a matrix), an associated |
575 DataSetFields iterates over fields (like columns of a matrix), and can be understood | 575 DataSetFields iterates over fields (like columns of a matrix), and can be understood |
576 as a transpose of the associated dataset. | 576 as a transpose of the associated dataset. |
577 | 577 |
578 To iterate over fields, one can do | 578 To iterate over fields, one can do |
579 * for fields in dataset.fields() | 579 * for fields in dataset.fields() |
636 return (self.examples() | other.examples()).fields() | 636 return (self.examples() | other.examples()).fields() |
637 | 637 |
638 | 638 |
639 class MinibatchDataSet(DataSet): | 639 class MinibatchDataSet(DataSet): |
640 """ | 640 """ |
641 Turn a LookupList of same-length (iterable) fields into an example-iterable dataset. | 641 Turn a L{LookupList} of same-length (iterable) fields into an example-iterable dataset. |
642 Each element of the lookup-list should be an iterable and sliceable, all of the same length. | 642 Each element of the lookup-list should be an iterable and sliceable, all of the same length. |
643 """ | 643 """ |
644 def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack, | 644 def __init__(self,fields_lookuplist,values_vstack=DataSet().valuesVStack, |
645 values_hstack=DataSet().valuesHStack): | 645 values_hstack=DataSet().valuesHStack): |
646 """ | 646 """ |
706 def valuesHStack(self,fieldnames,fieldvalues): | 706 def valuesHStack(self,fieldnames,fieldvalues): |
707 return self.values_hstack(fieldnames,fieldvalues) | 707 return self.values_hstack(fieldnames,fieldvalues) |
708 | 708 |
709 class HStackedDataSet(DataSet): | 709 class HStackedDataSet(DataSet): |
710 """ | 710 """ |
711 A DataSet that wraps several datasets and shows a view that includes all their fields, | 711 A L{DataSet} that wraps several datasets and shows a view that includes all their fields, |
712 i.e. whose list of fields is the concatenation of their lists of fields. | 712 i.e. whose list of fields is the concatenation of their lists of fields. |
713 | 713 |
714 If a field name is found in more than one of the datasets, then either an error is | 714 If a field name is found in more than one of the datasets, then either an error is |
715 raised or the fields are renamed (either by prefixing the __name__ attribute | 715 raised or the fields are renamed (either by prefixing the __name__ attribute |
716 of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list). | 716 of the dataset + ".", if it exists, or by suffixing the dataset index in the argument list). |
717 | 717 |
718 TODO: automatically detect a chain of stacked datasets due to A | B | C | D ... | 718 @todo: automatically detect a chain of stacked datasets due to A | B | C | D ... |
719 """ | 719 """ |
720 def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None): | 720 def __init__(self,datasets,accept_nonunique_names=False,description=None,field_types=None): |
721 DataSet.__init__(self,description,field_types) | 721 DataSet.__init__(self,description,field_types) |
722 self.datasets=datasets | 722 self.datasets=datasets |
723 self.accept_nonunique_names=accept_nonunique_names | 723 self.accept_nonunique_names=accept_nonunique_names |
805 """ | 805 """ |
806 return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues) | 806 return self.datasets[self.fieldname2dataset[fieldnames[0]]].valuesHStack(fieldnames,fieldvalues) |
807 | 807 |
808 class VStackedDataSet(DataSet): | 808 class VStackedDataSet(DataSet): |
809 """ | 809 """ |
810 A DataSet that wraps several datasets and shows a view that includes all their examples, | 810 A L{DataSet} that wraps several datasets and shows a view that includes all their examples, |
811 in the order provided. This clearly assumes that they all have the same field names | 811 in the order provided. This clearly assumes that they all have the same field names |
812 and all (except possibly the last one) are of finite length. | 812 and all (except possibly the last one) are of finite length. |
813 | 813 |
814 TODO: automatically detect a chain of stacked datasets due to A + B + C + D ... | 814 @todo: automatically detect a chain of stacked datasets due to A + B + C + D ... |
815 """ | 815 """ |
816 def __init__(self,datasets): | 816 def __init__(self,datasets): |
817 self.datasets=datasets | 817 self.datasets=datasets |
818 self.length=0 | 818 self.length=0 |
819 self.index2dataset={} | 819 self.index2dataset={} |
1023 return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) | 1023 return ArrayDataSetIterator(self,fieldnames,minibatch_size,n_batches,offset) |
1024 | 1024 |
1025 | 1025 |
1026 class CachedDataSet(DataSet): | 1026 class CachedDataSet(DataSet): |
1027 """ | 1027 """ |
1028 Wrap a dataset whose values are computationally expensive to obtain | 1028 Wrap a L{DataSet} whose values are computationally expensive to obtain |
1029 (e.g. because they involve some computation, or disk access), | 1029 (e.g. because they involve some computation, or disk access), |
1030 so that repeated accesses to the same example are done cheaply, | 1030 so that repeated accesses to the same example are done cheaply, |
1031 by caching every example value that has been accessed at least once. | 1031 by caching every example value that has been accessed at least once. |
1032 | 1032 |
1033 Optionally, for a finite-length dataset, all the values can be computed | 1033 Optionally, for a finite-length dataset, all the values can be computed |
1034 (and cached) upon construction of the CachedDataSet, rather than at the | 1034 (and cached) upon construction of the CachedDataSet, rather than at the |
1035 first access. | 1035 first access. |
1036 | 1036 |
1037 @todo when cache_all_upon_construction create mini-batches that are as | 1037 @todo: when cache_all_upon_construction create mini-batches that are as |
1038 large as possible but not so large as to fill up memory. | 1038 large as possible but not so large as to fill up memory. |
1039 | 1039 |
1040 @todo add disk-buffering capability, so that when the cache becomes too | 1040 @todo: add disk-buffering capability, so that when the cache becomes too |
1041 big for memory, we cache things on disk, trying to keep in memory only | 1041 big for memory, we cache things on disk, trying to keep in memory only |
1042 the record most likely to be accessed next. | 1042 the record most likely to be accessed next. |
1043 """ | 1043 """ |
1044 def __init__(self,source_dataset,cache_all_upon_construction=False): | 1044 def __init__(self,source_dataset,cache_all_upon_construction=False): |
1045 self.source_dataset=source_dataset | 1045 self.source_dataset=source_dataset |
1090 else: | 1090 else: |
1091 return DataSet.__getitem__(self,i) | 1091 return DataSet.__getitem__(self,i) |
1092 | 1092 |
1093 class ApplyFunctionDataSet(DataSet): | 1093 class ApplyFunctionDataSet(DataSet): |
1094 """ | 1094 """ |
1095 A dataset that contains as fields the results of applying a given function | 1095 A L{DataSet} that contains as fields the results of applying a |
1096 example-wise or minibatch-wise to all the fields of an input dataset. | 1096 given function example-wise or minibatch-wise to all the fields of |
1097 The output of the function should be an iterable (e.g. a list or a LookupList) | 1097 an input dataset. The output of the function should be an iterable |
1098 over the resulting values. | 1098 (e.g. a list or a LookupList) over the resulting values. |
1099 | 1099 |
1100 In minibatch mode, the function is expected to work on minibatches (takes | 1100 In minibatch mode, the function is expected to work on minibatches |
1101 a minibatch in input and returns a minibatch in output). More precisely, | 1101 (takes a minibatch in input and returns a minibatch in output). More |
1102 it means that each element of the input or output list should be iterable | 1102 precisely, it means that each element of the input or output list |
1103 and indexable over the individual example values (typically these | 1103 should be iterable and indexable over the individual example values |
1104 elements will be numpy arrays). All of the elements in the input and | 1104 (typically these elements will be numpy arrays). All of the elements |
1105 output lists should have the same length, which is the length of the | 1105 in the input and output lists should have the same length, which is |
1106 minibatch. | 1106 the length of the minibatch. |
1107 | 1107 |
1108 The function is applied each time an example or a minibatch is accessed. | 1108 The function is applied each time an example or a minibatch is accessed. |
1109 To avoid re-doing computation, wrap this dataset inside a CachedDataSet. | 1109 To avoid re-doing computation, wrap this dataset inside a CachedDataSet. |
1110 | 1110 |
1111 If the values_{h,v}stack functions are not provided, then | 1111 If the values_{h,v}stack functions are not provided, then |
1184 return ApplyFunctionSingleExampleIterator(self) | 1184 return ApplyFunctionSingleExampleIterator(self) |
1185 | 1185 |
1186 | 1186 |
1187 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): | 1187 def supervised_learning_dataset(src_dataset,input_fields,target_fields,weight_field=None): |
1188 """ | 1188 """ |
1189 Wraps an arbitrary DataSet into one for supervised learning tasks by forcing the | 1189 Wraps an arbitrary L{DataSet} into one for supervised learning tasks |
1190 user to define a set of fields as the 'input' field and a set of fields | 1190 by forcing the user to define a set of fields as the 'input' field |
1191 as the 'target' field. Optionally, a single weight_field can also be defined. | 1191 and a set of fields as the 'target' field. Optionally, a single |
1192 weight_field can also be defined. | |
1192 """ | 1193 """ |
1193 args = ((input_fields,'input'),(output_fields,'target')) | 1194 args = ((input_fields,'input'),(output_fields,'target')) |
1194 if weight_field: args+=(([weight_field],'weight')) | 1195 if weight_field: args+=(([weight_field],'weight')) |
1195 return src_dataset.merge_fields(*args) | 1196 return src_dataset.merge_fields(*args) |
1196 | 1197 |