#! /usr/bin/python
# -*- coding: utf-8 -*-
import bisect
import numpy as np
__all__ = [
'Dataset',
'IterableDataset',
'TensorDataset',
'ChainDataset',
'ConcatDataset',
'Subset',
'random_split',
]
[docs]class Dataset(object):
"""An abstract class to encapsulate methods and behaviors of datasets.
All datasets in map-style(dataset samples can be get by a given key) should be a subclass of 'tensorlayerx.dataflow.Dataset'.
ALl subclasses should implement following methods:
:code:`__getitem__`: get sample from dataset with a given index.
:code:`__len__`: return dataset sample number.
:code:`__add__`: concat two datasets
Examples
--------
With TensorLayerx
>>> from tensorlayerx.dataflow import Dataset
>>> class mnistdataset(Dataset):
>>> def __init__(self, data, label,transform):
>>> self.data = data
>>> self.label = label
>>> self.transform = transform
>>> def __getitem__(self, index):
>>> data = self.data[index].astype('float32')
>>> data = self.transform(data)
>>> label = self.label[index].astype('int64')
>>> return data, label
>>> def __len__(self):
>>> return len(self.data)
>>> train_dataset = mnistdataset(data = X_train, label = y_train ,transform = transform)
"""
def __init__(self):
pass
def __getitem__(self, idx):
raise NotImplementedError("'{}' not implement in class "\
"{}".format('__getitem__', self.__class__.__name__))
def __len__(self):
raise NotImplementedError("'{}' not implement in class "\
"{}".format('__len__', self.__class__.__name__))
def __add__(self, other):
return ConcatDataset([self, other])
[docs]class IterableDataset(object):
"""An abstract class to encapsulate methods and behaviors of iterable datasets.
All datasets in iterable-style (can only get sample one by one sequentially, likea Python iterator) should be a subclass of `tensorlayerx.dataflow.IterableDataset`.
All subclasses should implement following methods:
:code:`__iter__`: yield sample sequentially.
Examples
--------
With TensorLayerx
>>>#example 1:
>>> from tensorlayerx.dataflow import IterableDataset
>>> class mnistdataset(IterableDataset):
>>> def __init__(self, data, label,transform):
>>> self.data = data
>>> self.label = label
>>> self.transform = transform
>>> def __iter__(self):
>>> for i in range(len(self.data)):
>>> data = self.data[i].astype('float32')
>>> data = self.transform(data)
>>> label = self.label[i].astype('int64')
>>> yield data, label
>>> train_dataset = mnistdataset(data = X_train, label = y_train ,transform = transform)
>>>#example 2:
>>>iterable_dataset_1 = mnistdataset(data_1, label_1, transform_1)
>>>iterable_dataset_2 = mnistdataset(data_2, label_2, transform_2)
>>>new_iterable_dataset = iterable_dataset_1 + iterable_dataset_2
"""
def __init__(self):
pass
def __iter__(self):
raise NotImplementedError("'{}' not implement in class "\
"{}".format('__iter__', self.__class__.__name__))
def __add__(self, other):
return ChainDataset([self, other])
[docs]class TensorDataset(Dataset):
"""Generate a dataset from a list of tensors.
Each sample will be retrieved by indexing tensors along the first dimension.
Parameters
------------
*tensor : list or tuple of tensors
tensors that have the same size of the first dimension.
Examples
--------
With TensorLayerx
>>> import numpy as np
>>> import tensorlayerx as tlx
>>> data = np.random.random([10,224,224,3]).astype(np.float32)
>>> label = np.random.random((10,)).astype(np.int32)
>>> data = tlx.convert_to_tensor(data)
>>> label = tlx.convert_to_tensor(label)
>>> dataset = tlx.dataflow.TensorDataset([data, label])
>>> for i in range(len(dataset)):
>>> x, y = dataset[i]
"""
def __init__(self, *tensors):
super(TensorDataset, self).__init__()
assert all(
[tensor.shape[0] == tensors[0].shape[0] for tensor in tensors]
), "tensors not have same shape of the 1st dimension"
self.tensors = tensors
def __getitem__(self, item):
return tuple(tensor[item] for tensor in self.tensors)
def __len__(self):
return self.tensors[0].shape[0]
[docs]class ConcatDataset(Dataset):
"""Concat multiple datasets into a new dataset
Parameters
--------------
datasets : list or tuple
sequence of datasets to be concatenated
Examples
--------
With TensorLayerx
>>> import numpy as np
>>> from tensorlayerx.dataflow import Dataset, ConcatDataset
>>> class mnistdataset(Dataset):
>>> def __init__(self, data, label,transform):
>>> self.data = data
>>> self.label = label
>>> self.transform = transform
>>> def __getitem__(self, index):
>>> data = self.data[index].astype('float32')
>>> data = self.transform(data)
>>> label = self.label[index].astype('int64')
>>> return data, label
>>> def __len__(self):
>>> return len(self.data)
>>> train_dataset1 = mnistdataset(data = X_train1, label = y_train1 ,transform = transform1)
>>> train_dataset2 = mnistdataset(data = X_train2, label = y_train2 ,transform = transform2)
>>> train_dataset = ConcatDataset([train_dataset1, train_dataset2])
"""
@staticmethod
def cumsum(sequence):
r, s = [], 0
for e in sequence:
l = len(e)
r.append(l + s)
s += l
return r
def __init__(self, datasets):
super(ConcatDataset, self).__init__()
assert len(datasets) > 0, 'datasets should not be an empty iterable.'
self.datasets = list(datasets)
for dataset in self.datasets:
assert not isinstance(dataset, IterableDataset), "ConcatDataset can not support IterableDataset."
self.cumulative_sizes = self.cumsum(self.datasets)
def __len__(self):
return self.cumulative_sizes[-1]
def __getitem__(self, item):
dataset_id = bisect.bisect_right(self.cumulative_sizes, item)
if dataset_id == 0:
sample_id = item
else:
sample_id = item - self.cumulative_sizes[dataset_id - 1]
return self.datasets[dataset_id][sample_id]
[docs]class ChainDataset(IterableDataset):
"""A Dataset which chains multiple iterable-tyle datasets.
Parameters
------------
datasets : list or tuple
sequence of datasets to be chainned.
Examples
--------
With TensorLayerx
>>> import numpy as np
>>> from tensorlayerx.dataflow import IterableDataset, ChainDataset
>>> class mnistdataset(IterableDataset):
>>> def __init__(self, data, label):
>>> self.data = data
>>> self.label = label
>>> def __iter__(self):
>>> for i in range(len(self.data)):
>>> yield self.data[i] self.label[i]
>>> train_dataset1 = mnistdataset(data = X_train1, label = y_train1)
>>> train_dataset2 = mnistdataset(data = X_train2, label = y_train2)
>>> train_dataset = ChainDataset([train_dataset1, train_dataset2])
"""
def __init__(self, datasets):
super(ChainDataset, self).__init__()
assert len(datasets) > 0, 'datasets should not be an empty iterable.'
for dataset in datasets:
assert isinstance(dataset, IterableDataset), "ChainDataset only supports IterableDataset"
self.datasets = list(datasets)
def __iter__(self):
for dataset in self.datasets:
for x in dataset:
yield x
def __len__(self):
l = 0
for dataset in self.datasets:
l += len(dataset)
return l
[docs]class Subset(Dataset):
"""Subset of a dataset at specified indices.
Parameters
-------------
dataset : Dataset
The whole Dataset
indices : list or tuple
Indices in the whole set selected for subset
Examples
--------
With TensorLayerx
>>> import numpy as np
>>> from tensorlayerx.dataflow import Dataset, Subset
>>> class mnistdataset(Dataset):
>>> def __init__(self, data, label):
>>> self.data = data
>>> self.label = label
>>> def __iter__(self):
>>> for i in range(len(self.data)):
>>> yield self.data[i] self.label[i]
>>> train_dataset = mnistdataset(data = X_train, label = y_train)
>>> sub_dataset = Subset(train_dataset, indices=[1,2,3])
"""
def __init__(self, dataset, indices):
super(Subset, self).__init__()
assert not isinstance(dataset, IterableDataset), "Subset does not support IterableDataset."
self.dataset = dataset
self.indices = indices
def __getitem__(self, item):
return self.dataset[self.indices[item]]
def __len__(self):
return len(self.indices)
# Taken from python 3.5 docs
def _accumulate(iterable, fn=lambda x, y: x + y):
'Return running totals'
# _accumulate([1,2,3,4,5]) --> 1 3 6 10 15
# _accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120
it = iter(iterable)
try:
total = next(it)
except StopIteration:
return
yield total
for element in it:
total = fn(total, element)
yield total
[docs]def random_split(dataset, lengths):
"""Randomly split a dataset into non-overlapping new datasets of given lengths.
Parameters
----------
dataset : Dataset
dataset to be split
lengths : list or tuple
lengths of splits to be produced
Examples
--------
With TensorLayerx
>>> import numpy as np
>>> from tensorlayerx.dataflow import Dataset, Subset
>>> random_split(range(10), [3, 7])
"""
if sum(lengths) != len(dataset):
raise ValueError("Sum of input lengths does not equal the length of the input dataset!")
generator = np.random.default_rng()
indices = generator.permutation(sum(lengths))
return [Subset(dataset, indices[offset - length:offset]) for offset, length in zip(_accumulate(lengths), lengths)]