Source code for tensorlayerx.dataflow.sampler

#! /usr/bin/python
# -*- coding: utf-8 -*-

import numpy as np

__all__ = [
    'Sampler',
    'BatchSampler',
    'RandomSampler',
    'SequentialSampler',
    'WeightedRandomSampler',
    'SubsetRandomSampler',
]


[docs]class Sampler(object): """Base class for all Samplers. All subclasses should implement following methods: :code:`__iter__`: providing a way to iterate over indices of dataset element :code:`__len__`: the length of the returned iterators. Examples -------- With TensorLayerx >>> from tensorlayerx.dataflow import Sampler >>> class MySampler(Sampler): >>> def __init__(self, data): >>> self.data = data >>> def __iter__(self): >>> return iter(range(len(self.data_source))) >>> def __len__(self): >>> return len(self.data) """ def __init__(self): pass def __iter__(self): raise NotImplementedError
[docs]class BatchSampler(Sampler): """Wraps another sampler to yield a mini-batch of indices. Parameters ---------- sampler : Sampler Base sampler. batch_size : int Size of mini-batch drop_last : bool If ``True``, the sampler will drop the last batch if its size would be less than ``batch_size`` Examples -------- With TensorLayerx >>> from tensorlayerx.dataflow import BatchSampler, SequentialSampler >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False)) >>> #[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] >>> list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=True)) >>> #[[0, 1, 2], [3, 4, 5], [6, 7, 8]] """ def __init__(self, sampler=None, batch_size=1, drop_last=False): super(BatchSampler, self).__init__() if not isinstance(batch_size, int) or batch_size <= 0: raise ValueError("batch_size should be a positive integer value, but got {}.".format(type(batch_size))) if not isinstance(drop_last, bool): raise ValueError("drop_last should be a bool value, but got {}.".format(type(drop_last))) self.sampler = sampler self.batch_size = batch_size self.drop_last = drop_last def __iter__(self): batch_idxs = [] for index in self.sampler: batch_idxs.append(index) if len(batch_idxs) == self.batch_size: yield batch_idxs batch_idxs = [] if len(batch_idxs) > 0 and not self.drop_last: yield batch_idxs def __len__(self): num_samples = len(self.sampler) if self.drop_last: return num_samples // self.batch_size else: return (num_samples + self.batch_size - 1) // self.batch_size
[docs]class RandomSampler(Sampler): """Samples elements randomly. If without replacement, then sample from a shuffled dataset. If with replacement, then user can specify`num_samples` to draw. Parameters ------------- data : Dataset dataset to sample replacement : bool samples are drawn on-demand with replacement if ``True``, default=``False`` num_samples : int number of samples to draw, default=`len(dataset)`. This argument is supposed to be specified only when `replacement` is ``True``. generator : Generator Generator used in sampling. Default is None. Examples -------- With TensorLayerx >>> from tensorlayerx.dataflow import RandomSampler, Dataset >>> import numpy as np >>> class mydataset(Dataset): >>> def __init__(self): >>> self.data = [np.random.random((224,224,3)) for i in range(100)] >>> self.label = [np.random.randint(1, 10, (1,)) for i in range(100)] >>> def __getitem__(self, item): >>> x = self.data[item] >>> y = self.label[item] >>> return x, y >>> def __len__(self): >>> return len(self.data) >>> sampler = RandomSampler(data = mydataset()) """ def __init__(self, data, replacement=False, num_samples=None, generator=None): super(RandomSampler, self).__init__() self.data = data self.replacement = replacement self._num_samples = num_samples self.generator = generator if not isinstance(self.replacement, bool): raise TypeError("replacement should be a boolean value, but got " "replacement={}".format(self.replacement)) if self._num_samples is not None and not replacement: raise ValueError("When replacement is False, num_samples should not be specified.") if not isinstance(self.num_samples, int) or self.num_samples <= 0: raise ValueError( "num_samples should be a positive integer, " "but got num_samples={}".format(self.num_samples) ) @property def num_samples(self): if self._num_samples is None: return len(self.data) return self._num_samples def __iter__(self): n = len(self.data) if self.generator is None: generator = np.random.default_rng() if self.replacement: for index in generator.choice(np.arange(n), self.num_samples, replace=True).tolist(): yield index else: for index in generator.choice(np.arange(n), n, replace=False).tolist(): yield index else: for i in range(self.num_samples): try: index = next(self.generator) except StopIteration: return yield index def __len__(self): return self.num_samples
[docs]class SequentialSampler(Sampler): """Samples elements sequentially, always in the same order. Parameters ---------- data : Dataset dataset to sample Examples -------- With TensorLayerx >>> from tensorlayerx.dataflow import SequentialSampler, Dataset >>> import numpy as np >>> class mydataset(Dataset): >>> def __init__(self): >>> self.data = [np.random.random((224,224,3)) for i in range(100)] >>> self.label = [np.random.randint(1, 10, (1,)) for i in range(100)] >>> def __getitem__(self, item): >>> x = self.data[item] >>> y = self.label[item] >>> return x, y >>> def __len__(self): >>> return len(self.data) >>> sampler = SequentialSampler(data = mydataset()) """ def __init__(self, data): super(SequentialSampler, self).__init__() self.data = data def __iter__(self): return iter(range(len(self.data))) def __len__(self): return len(self.data)
[docs]class WeightedRandomSampler(Sampler): """Samples elements from ``[0,..,len(weights)-1]`` with given probabilities (weights). Parameters ----------- weights : list or tuple a sequence of weights, not necessary summing up to one num_samples : int number of samples to draw replacement : bool if ``True``, samples are drawn with replacement. If not, they are drawn without replacement, which means that when a sample index is drawn for a row, it cannot be drawn again for that row. Examples -------- With TensorLayerx >>> from tensorlayerx.dataflow import WeightedRandomSampler, Dataset >>> import numpy as np >>> sampler = list(WeightedRandomSampler(weights=[0.2,0.3,0.4,0.5,4.0], num_samples=5, replacement=True)) >>> #[4, 4, 1, 4, 4] >>> sampler = list(WeightedRandomSampler(weights=[0.2,0.3,0.4,0.5,0.6], num_samples=5, replacement=False)) >>> #[4, 1, 3, 0, 2] """ def __init__(self, weights, num_samples, replacement=True): super(WeightedRandomSampler, self).__init__() if not isinstance(weights, (list, tuple, np.ndarray)): raise ValueError("weights should be a list, tuple or numpy.ndarray, but got {}.".format(type(weights))) weights = np.asarray(weights, np.float) assert len(weights.shape) == 1, "weights should be a 1-D array" if np.any(weights < 0.0): raise ValueError("weights should be positive value.") if not np.sum(weights) > 0.0: raise ValueError("The sum of weights should be a positive value.") if not replacement: if np.sum(weights > 0.0) < num_samples: raise ValueError( "when replacement is False, the number of positive values in weights should be greater than numsamples." ) self.weights = weights / weights.sum() self.num_samples = num_samples self.replacement = replacement def __iter__(self): index = np.random.choice(len(self.weights), self.num_samples, self.replacement, self.weights) return iter(index.tolist()) def __len__(self): return self.num_samples
[docs]class SubsetRandomSampler(Sampler): """Samples elements randomly from a given list of indices, without replacement. Parameters ---------- indices : list or tuple sequence of indices """ def __init__(self, indices): super(SubsetRandomSampler, self).__init__() self.indices = indices def __iter__(self): return (self.indices[i] for i in np.random.permutation(len(self.indices))) def __len__(self): return len(self.indices)