# HG changeset patch
# User Joseph Turian
# Date 1209767807 14400
# Node ID 2b6656b2ef52bf7205356bcb123ee8b7fc7d157f
# Parent  5b699b31770a5e8153b93e2b882dbb54480493b8
Changed docs slightly

diff -r 5b699b31770a -r 2b6656b2ef52 dataset.py
--- a/dataset.py	Fri May 02 18:19:35 2008 -0400
+++ b/dataset.py	Fri May 02 18:36:47 2008 -0400
@@ -27,29 +27,29 @@
     feasible or not recommended on streams).

     To iterate over examples, there are several possibilities:
-    * for example in dataset([field1, field2,field3, ...]):
-    * for val1,val2,val3 in dataset([field1, field2,field3]):
-    * for minibatch in dataset.minibatches([field1, field2, ...],minibatch_size=N):
-    * for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
-    * for example in dataset:
+    - for example in dataset([field1, field2, field3, ...]):
+    - for val1,val2,val3 in dataset([field1, field2, field3]):
+    - for minibatch in dataset.minibatches([field1, field2, ...], minibatch_size=N):
+    - for mini1,mini2,mini3 in dataset.minibatches([field1, field2, field3], minibatch_size=N):
+    - for example in dataset::
         print example['x']
-    * for x,y,z in dataset:
-    Each of these is documented below. All of these iterators are expected
-    to provide, in addition to the usual 'next()' method, a 'next_index()' method
-    which returns a non-negative integer pointing to the position of the next
-    example that will be returned by 'next()' (or of the first example in the
-    next minibatch returned). This is important because these iterators
-    can wrap around the dataset in order to do multiple passes through it,
-    in possibly unregular ways if the minibatch size is not a divisor of the
-    dataset length.
+    - for x,y,z in dataset:
+    Each of these is documented below. All of these iterators are expected
+    to provide, in addition to the usual 'next()' method, a 'next_index()' method
+    which returns a non-negative integer pointing to the position of the next
+    example that will be returned by 'next()' (or of the first example in the
+    next minibatch returned). This is important because these iterators
+    can wrap around the dataset in order to do multiple passes through it,
+    in possibly irregular ways if the minibatch size is not a divisor of the
+    dataset length.
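+
+    For instance, assuming the dataset has two fields named 'x' and 'y'
+    (illustrative names), a minibatch loop could be written::
+
+        for x,y in dataset.minibatches(['x','y'], minibatch_size=10):
+            print x, y  # the 'x' and 'y' values for 10 consecutive examples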

     To iterate over fields, one can do
-    * for field in dataset.fields():
+    - for field in dataset.fields():
         for field_value in field: # iterate over the values associated to that field for all the dataset examples
-    * for field in dataset(field1,field2,...).fields() to select a subset of fields
-    * for field in dataset.fields(field1,field2,...) to select a subset of fields
+    - for field in dataset(field1,field2,...).fields() to select a subset of fields
+    - for field in dataset.fields(field1,field2,...) to select a subset of fields
     and each of these fields is iterable over the examples:
-    * for field_examples in dataset.fields():
+    - for field_examples in dataset.fields():
         for example_value in field_examples:
             ...
     but when the dataset is a stream (unbounded length), it is not recommended to do
@@ -57,7 +57,7 @@
     an unsynchronized way. Hence the fields() method is illegal for streams, by default.
     The result of fields() is a DataSetFields object, which iterates over fields,
     and whose elements are iterable over examples. A DataSetFields object can
-    be turned back into a DataSet with its examples() method:
+    be turned back into a DataSet with its examples() method::
         dataset2 = dataset1.fields().examples()
     and dataset2 should behave exactly like dataset1 (in fact by default dataset2==dataset1).
@@ -72,20 +72,20 @@
     of examples) can be extracted. These operations are not supported by default
     in the case of streams.

-    * dataset[:n] returns a dataset with the n first examples.
+    - dataset[:n] returns a dataset with the first n examples.

-    * dataset[i1:i2:s] returns a dataset with the examples i1,i1+s,...i2-s.
+    - dataset[i1:i2:s] returns a dataset with the examples i1,i1+s,...i2-s.

-    * dataset[i] returns an Example.
+    - dataset[i] returns an Example.

-    * dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.
+    - dataset[[i1,i2,...in]] returns a dataset with examples i1,i2,...in.

-    * dataset[fieldname] an iterable over the values of the field fieldname across
-      the dataset (the iterable is obtained by default by calling valuesVStack
-      over the values for individual examples).
+    - dataset[fieldname] returns an iterable over the values of the field fieldname
+      across the dataset (the iterable is obtained by default by calling valuesVStack
+      over the values for individual examples).

-    * dataset.<property> returns the value of a property associated with
-      the name <property>. The following properties should be supported:
+    - dataset.<property> returns the value of a property associated with
+      the name <property>. The following properties should be supported:
         - 'description': a textual description or name for the dataset
         - 'fieldtypes': a list of types (one per field)
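+
+    For instance, assuming a dataset of length 100 with a field named 'x'
+    (illustrative values), these operations could be used as::
+
+        subset  = dataset[0:100:10]  # a dataset with examples 0,10,...,90
+        example = dataset[3]         # a single Example
+        some    = dataset[[2,5,7]]   # a dataset with examples 2, 5 and 7
+        xs      = dataset['x']       # iterable over the 'x' value of each example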
@@ -94,12 +94,12 @@
     the following operations (with the same basic semantics as
     numpy.hstack and numpy.vstack):

-    * dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3])
+    - dataset1 | dataset2 | dataset3 == dataset.hstack([dataset1,dataset2,dataset3])

     creates a new dataset whose list of fields is the concatenation of the list of
     fields of the argument datasets. This only works if they all have the same length.

-    * dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3])
+    - dataset1 & dataset2 & dataset3 == dataset.vstack([dataset1,dataset2,dataset3])

     creates a new dataset that concatenates the examples from the argument datasets
     (and whose length is the sum of the lengths of the argument datasets). This only
@@ -116,15 +116,15 @@
     when key is a string (or more specifically, neither an integer, a slice, nor a list).

     A DataSet sub-class should always redefine the following methods:
-    * __len__ if it is not a stream
-    * fieldNames
-    * minibatches_nowrap (called by DataSet.minibatches())
-    * valuesHStack
-    * valuesVStack
+    - __len__ if it is not a stream
+    - fieldNames
+    - minibatches_nowrap (called by DataSet.minibatches())
+    - valuesHStack
+    - valuesVStack
     For efficiency of implementation, a sub-class might also want to redefine
-    * hasFields
-    * __getitem__ may not be feasible with some streams
-    * __iter__
+    - hasFields
+    - __getitem__ (may not be feasible with some streams)
+    - __iter__
     """

     def __init__(self,description=None,fieldtypes=None):

diff -r 5b699b31770a -r 2b6656b2ef52 filetensor.py
--- a/filetensor.py	Fri May 02 18:19:35 2008 -0400
+++ b/filetensor.py	Fri May 02 18:36:47 2008 -0400
@@ -1,18 +1,18 @@
 """
 Read and write the matrix file format described at
-http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html
+U{http://www.cs.nyu.edu/~ylclab/data/norb-v1.0/index.html}

 The format is for dense tensors:

-    magic number indicating type and endianness - 4bytes
-    rank of tensor - int32
-    dimensions - int32, int32, int32, ...
-    <data>
+    - magic number indicating type and endianness - 4 bytes
+    - rank of tensor - int32
+    - dimensions - int32, int32, int32, ...
+    - <data>

 The number of dimensions and rank is slightly tricky:
-    for scalar: rank=0, dimensions = [1, 1, 1]
-    for vector: rank=1, dimensions = [?, 1, 1]
-    for matrix: rank=2, dimensions = [?, ?, 1]
+    - for scalar: rank=0, dimensions = [1, 1, 1]
+    - for vector: rank=1, dimensions = [?, 1, 1]
+    - for matrix: rank=2, dimensions = [?, ?, 1]

 For rank >= 3, the number of dimensions matches the rank exactly.
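+
+For instance, a minimal sketch of reading just the header, assuming a
+little-endian file already opened as a binary file object 'f'
+(hypothetical variable)::
+
+    import struct
+    magic, rank = struct.unpack('<ii', f.read(8))
+    ndim = max(3, rank)  # at least 3 dimension words are always stored
+    dims = struct.unpack('<%di' % ndim, f.read(4 * ndim))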