From c5eb488b463f57c01cbf7e6637a8bd7177c7b63d Mon Sep 17 00:00:00 2001 From: SigureMo Date: Thu, 27 Jul 2023 03:06:54 +0800 Subject: [PATCH 1/7] [xdoctest] reformat example code with google style in `paddle/io` --- python/paddle/io/dataloader/batch_sampler.py | 155 ++++--- python/paddle/io/dataloader/dataset.py | 443 ++++++++++--------- python/paddle/io/dataloader/sampler.py | 153 ++++--- python/paddle/io/dataloader/worker.py | 91 ++-- python/paddle/io/reader.py | 122 +++-- 5 files changed, 505 insertions(+), 459 deletions(-) diff --git a/python/paddle/io/dataloader/batch_sampler.py b/python/paddle/io/dataloader/batch_sampler.py index 190e9240900f8..b8349fd6d92ab 100644 --- a/python/paddle/io/dataloader/batch_sampler.py +++ b/python/paddle/io/dataloader/batch_sampler.py @@ -58,40 +58,37 @@ class BatchSampler(Sampler): .. code-block:: python - from paddle.io import RandomSampler, BatchSampler, Dataset - - # init with dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - bs = BatchSampler(dataset=RandomDataset(100), - shuffle=False, - batch_size=16, - drop_last=False) - - for batch_indices in bs: - print(batch_indices) - - # init with sampler - sampler = RandomSampler(RandomDataset(100)) - bs = BatchSampler(sampler=sampler, - batch_size=8, - drop_last=True) - - for batch_indices in bs: - print(batch_indices) - - - + >>> from paddle.io import RandomSampler, BatchSampler, Dataset + + >>> # init with dataset + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> bs = BatchSampler(dataset=RandomDataset(100), + ... shuffle=False, + ... batch_size=16, + ... drop_last=False) + ... + >>> for batch_indices in bs: + ... print(batch_indices) + ... + >>> # init with sampler + >>> sampler = RandomSampler(RandomDataset(100)) + >>> bs = BatchSampler(sampler=sampler, + ... batch_size=8, + ... drop_last=True) + ... + >>> for batch_indices in bs: + ... print(batch_indices) """ def __init__( @@ -203,29 +200,29 @@ class DistributedBatchSampler(BatchSampler): Examples: .. code-block:: python - import numpy as np - - from paddle.io import Dataset, DistributedBatchSampler - - # init with dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = RandomDataset(100) - sampler = DistributedBatchSampler(dataset, batch_size=64) - - for data in sampler: - # do something - break + >>> import numpy as np + + >>> from paddle.io import Dataset, DistributedBatchSampler + + >>> # init with dataset + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... 
return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> dataset = RandomDataset(100) + >>> sampler = DistributedBatchSampler(dataset, batch_size=64) + + >>> for data in sampler: + ... # do something + ... break """ def __init__( @@ -339,27 +336,27 @@ def set_epoch(self, epoch): Examples: .. code-block:: python - import numpy as np - - from paddle.io import Dataset, DistributedBatchSampler - - # init with dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = RandomDataset(100) - sampler = DistributedBatchSampler(dataset, batch_size=64) - - for epoch in range(10): - sampler.set_epoch(epoch) + >>> import numpy as np + + >>> from paddle.io import Dataset, DistributedBatchSampler + + >>> # init with dataset + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> dataset = RandomDataset(100) + >>> sampler = DistributedBatchSampler(dataset, batch_size=64) + + >>> for epoch in range(10): + ... sampler.set_epoch(epoch) """ self.epoch = epoch diff --git a/python/paddle/io/dataloader/dataset.py b/python/paddle/io/dataloader/dataset.py index 3e0458ae9b700..5253c647b1db3 100755 --- a/python/paddle/io/dataloader/dataset.py +++ b/python/paddle/io/dataloader/dataset.py @@ -37,26 +37,25 @@ class Dataset: .. code-block:: python - import numpy as np - from paddle.io import Dataset - - # define a random dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = RandomDataset(10) - for i in range(len(dataset)): - print(dataset[i]) - + >>> import numpy as np + >>> from paddle.io import Dataset + + >>> # define a random dataset + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> dataset = RandomDataset(10) + >>> for i in range(len(dataset)): + ... print(dataset[i]) """ def __init__(self): @@ -95,23 +94,23 @@ class IterableDataset(Dataset): .. 
code-block:: python :name: code-example1 - import numpy as np - from paddle.io import IterableDataset - - # define a random dataset - class RandomDataset(IterableDataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __iter__(self): - for i in range(self.num_samples): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - yield image, label - - dataset = RandomDataset(10) - for img, lbl in dataset: - print(img, lbl) + >>> import numpy as np + >>> from paddle.io import IterableDataset + + >>> # define a random dataset + >>> class RandomDataset(IterableDataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __iter__(self): + ... for i in range(self.num_samples): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... yield image, label + ... + >>> dataset = RandomDataset(10) + >>> for img, lbl in dataset: + ... print(img, lbl) When :attr:`num_workers > 0`, each worker has a different copy of the dataset object and will yield whole dataset samples, which means samples in dataset will be repeated in @@ -125,87 +124,113 @@ def __iter__(self): .. code-block:: python :name: code-example2 - import math - import paddle - import numpy as np - from paddle.io import IterableDataset, DataLoader, get_worker_info - - class SplitedIterableDataset(IterableDataset): - def __init__(self, start, end): - self.start = start - self.end = end - - def __iter__(self): - worker_info = get_worker_info() - if worker_info is None: - iter_start = self.start - iter_end = self.end - else: - per_worker = int( - math.ceil((self.end - self.start) / float( - worker_info.num_workers))) - worker_id = worker_info.id - iter_start = self.start + worker_id * per_worker - iter_end = min(iter_start + per_worker, self.end) - - for i in range(iter_start, iter_end): - yield np.array([i]) - - dataset = SplitedIterableDataset(start=2, end=9) - dataloader = DataLoader( - dataset, - num_workers=2, - batch_size=1, - drop_last=True) - - for data in dataloader: - print(data) - # outputs: [2, 5, 3, 6, 4, 7] + >>> import math + >>> import paddle + >>> import numpy as np + >>> from paddle.io import IterableDataset, DataLoader, get_worker_info + + >>> class SplitedIterableDataset(IterableDataset): + ... def __init__(self, start, end): + ... self.start = start + ... self.end = end + ... + ... def __iter__(self): + ... worker_info = get_worker_info() + ... if worker_info is None: + ... iter_start = self.start + ... iter_end = self.end + ... else: + ... per_worker = int( + ... math.ceil((self.end - self.start) / float( + ... worker_info.num_workers))) + ... worker_id = worker_info.id + ... iter_start = self.start + worker_id * per_worker + ... iter_end = min(iter_start + per_worker, self.end) + ... + ... for i in range(iter_start, iter_end): + ... yield np.array([i]) + ... + >>> dataset = SplitedIterableDataset(start=2, end=9) + >>> dataloader = DataLoader( + ... dataset, + ... num_workers=2, + ... batch_size=1, + ... drop_last=True) + ... + >>> for data in dataloader: + ... 
print(data) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[3]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[4]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[5]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[6]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[7]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[8]]) splitting data copy in each worker by :code:`worker_init_fn` .. code-block:: python :name: code-example3 - import math - import paddle - import numpy as np - from paddle.io import IterableDataset, DataLoader, get_worker_info - - class RangeIterableDataset(IterableDataset): - def __init__(self, start, end): - self.start = start - self.end = end - - def __iter__(self): - for i in range(self.start, self.end): - yield np.array([i]) - - dataset = RangeIterableDataset(start=2, end=9) - - def worker_init_fn(worker_id): - worker_info = get_worker_info() - - dataset = worker_info.dataset - start = dataset.start - end = dataset.end - num_per_worker = int( - math.ceil((end - start) / float(worker_info.num_workers))) - - worker_id = worker_info.id - dataset.start = start + worker_id * num_per_worker - dataset.end = min(dataset.start + num_per_worker, end) - - dataloader = DataLoader( - dataset, - num_workers=2, - batch_size=1, - drop_last=True, - worker_init_fn=worker_init_fn) - - for data in dataloader: - print(data) - # outputs: [2, 5, 3, 6, 4, 7] + >>> import math + >>> import paddle + >>> import numpy as np + >>> from paddle.io import IterableDataset, DataLoader, get_worker_info + + >>> class RangeIterableDataset(IterableDataset): + ... def __init__(self, start, end): + ... self.start = start + ... self.end = end + ... + ... def __iter__(self): + ... for i in range(self.start, self.end): + ... yield np.array([i]) + ... + >>> dataset = RangeIterableDataset(start=2, end=9) + + >>> def worker_init_fn(worker_id): + ... worker_info = get_worker_info() + ... + ... dataset = worker_info.dataset + ... start = dataset.start + ... end = dataset.end + ... num_per_worker = int( + ... math.ceil((end - start) / float(worker_info.num_workers))) + ... + ... worker_id = worker_info.id + ... dataset.start = start + worker_id * num_per_worker + ... dataset.end = min(dataset.start + num_per_worker, end) + ... + >>> dataloader = DataLoader( + ... dataset, + ... num_workers=2, + ... batch_size=1, + ... drop_last=True, + ... worker_init_fn=worker_init_fn) + ... + >>> for data in dataloader: + ... print(data) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[3]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[4]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[5]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[6]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[7]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[8]]) """ @@ -249,22 +274,21 @@ class TensorDataset(Dataset): .. 
code-block:: python - import numpy as np - import paddle - from paddle.io import TensorDataset - + >>> import numpy as np + >>> import paddle + >>> from paddle.io import TensorDataset - input_np = np.random.random([2, 3, 4]).astype('float32') - input = paddle.to_tensor(input_np) - label_np = np.random.random([2, 1]).astype('int32') - label = paddle.to_tensor(label_np) - dataset = TensorDataset([input, label]) + >>> input_np = np.random.random([2, 3, 4]).astype('float32') + >>> input = paddle.to_tensor(input_np) + >>> label_np = np.random.random([2, 1]).astype('int32') + >>> label = paddle.to_tensor(label_np) - for i in range(len(dataset)): - input, label = dataset[i] - print(input, label) + >>> dataset = TensorDataset([input, label]) + >>> for i in range(len(dataset)): + ... input, label = dataset[i] + ... print(input, label) """ def __init__(self, tensors): @@ -309,32 +333,31 @@ class ComposeDataset(Dataset): .. code-block:: python - import numpy as np - import paddle - from paddle.io import Dataset, ComposeDataset - - - # define a random dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([32]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)]) - for i in range(len(dataset)): - image1, label1, image2, label2 = dataset[i] - print(image1) - print(label1) - print(image2) - print(label2) - + >>> import numpy as np + >>> import paddle + >>> from paddle.io import Dataset, ComposeDataset + + + >>> # define a random dataset + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([32]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)]) + >>> for i in range(len(dataset)): + ... image1, label1, image2, label2 = dataset[i] + ... print(image1) + ... print(label1) + ... print(image2) + ... print(label2) """ def __init__(self, datasets): @@ -379,26 +402,25 @@ class ChainDataset(IterableDataset): .. code-block:: python - import numpy as np - import paddle - from paddle.io import IterableDataset, ChainDataset - - - # define a random dataset - class RandomDataset(IterableDataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __iter__(self): - for i in range(10): - image = np.random.random([32]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - yield image, label - - dataset = ChainDataset([RandomDataset(10), RandomDataset(10)]) - for image, label in iter(dataset): - print(image, label) - + >>> import numpy as np + >>> import paddle + >>> from paddle.io import IterableDataset, ChainDataset + + + >>> # define a random dataset + >>> class RandomDataset(IterableDataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __iter__(self): + ... for i in range(10): + ... image = np.random.random([32]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... yield image, label + ... + >>> dataset = ChainDataset([RandomDataset(10), RandomDataset(10)]) + >>> for image, label in iter(dataset): + ... 
print(image, label) """ def __init__(self, datasets): @@ -430,18 +452,18 @@ class Subset(Dataset): .. code-block:: python - import paddle - from paddle.io import Subset + >>> import paddle + >>> from paddle.io import Subset - # example 1: - a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2]) - print(list(a)) - # [1, 3] + >>> # example 1: + >>> a = paddle.io.Subset(dataset=range(1, 4), indices=[0, 2]) + >>> print(list(a)) + [1, 3] - # example 2: - b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1]) - print(list(b)) - # [2, 2] + >>> # example 2: + >>> b = paddle.io.Subset(dataset=range(1, 4), indices=[1, 1]) + >>> print(list(b)) + [2, 2] """ def __init__(self, dataset, indices): @@ -472,31 +494,34 @@ def random_split(dataset, lengths, generator=None): .. code-block:: python - import paddle - from paddle.io import random_split - - a_list = paddle.io.random_split(range(10), [3, 7]) - print(len(a_list)) - # 2 - - for idx, v in enumerate(a_list[0]): - print(idx, v) - - # output of the first subset - # 0 1 - # 1 3 - # 2 9 - - for idx, v in enumerate(a_list[1]): - print(idx, v) - # output of the second subset - # 0 5 - # 1 7 - # 2 8 - # 3 6 - # 4 0 - # 5 2 - # 6 4 + >>> import paddle + >>> from paddle.io import random_split + + >>> a_list = paddle.io.random_split(range(10), [3, 7]) + >>> print(len(a_list)) + 2 + + >>> # output of the first subset + >>> for idx, v in enumerate(a_list[0]): + ... print(idx, v) + >>> # doctest: +SKIP + 0 1 + 1 3 + 2 9 + >>> # doctest: -SKIP + + >>> # output of the second subset + >>> for idx, v in enumerate(a_list[1]): + ... print(idx, v) + >>> # doctest: +SKIP + 0 5 + 1 7 + 2 8 + 3 6 + 4 0 + 5 2 + 6 4 + >>> # doctest: -SKIP """ # Cannot verify that dataset is Sized if sum(lengths) != len(dataset): # type: ignore @@ -528,8 +553,12 @@ def _accumulate(iterable, fn=lambda x, y: x + y): .. code-block:: python - _accumulate([1,2,3,4,5]) --> 1 3 6 10 15 - _accumulate([1,2,3,4,5], operator.mul) --> 1 2 6 24 120 + >>> list(_accumulate([1, 2, 3, 4, 5])) + [1, 3, 6, 10, 15] + + >>> import operator + >>> list(_accumulate([1, 2, 3, 4, 5], operator.mul)) + [1, 2, 6, 24, 120] """ it = iter(iterable) diff --git a/python/paddle/io/dataloader/sampler.py b/python/paddle/io/dataloader/sampler.py index aa8a4e649c76c..e85f51510b704 100644 --- a/python/paddle/io/dataloader/sampler.py +++ b/python/paddle/io/dataloader/sampler.py @@ -44,34 +44,39 @@ class Sampler: .. code-block:: python - from paddle.io import Dataset, Sampler - - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - class MySampler(Sampler): - def __init__(self, data_source): - self.data_source = data_source - - def __iter__(self): - return iter(range(len(self.data_source))) - - def __len__(self): - return len(self.data_source) - - sampler = MySampler(data_source=RandomDataset(100)) - - for index in sampler: - print(index) + >>> from paddle.io import Dataset, Sampler + + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... 
+ >>> class MySampler(Sampler): + ... def __init__(self, data_source): + ... self.data_source = data_source + ... + ... def __iter__(self): + ... return iter(range(len(self.data_source))) + ... + ... def __len__(self): + ... return len(self.data_source) + ... + >>> sampler = MySampler(data_source=RandomDataset(100)) + + >>> for index in sampler: + ... print(index) + 0 + 1 + 2 + ... + 99 see `paddle.io.BatchSampler` see `paddle.io.DataLoader` @@ -105,24 +110,29 @@ class SequenceSampler(Sampler): .. code-block:: python - from paddle.io import Dataset, SequenceSampler - - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - sampler = SequenceSampler(data_source=RandomDataset(100)) - - for index in sampler: - print(index) + >>> from paddle.io import Dataset, SequenceSampler + + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> sampler = SequenceSampler(data_source=RandomDataset(100)) + + >>> for index in sampler: + ... print(index) + 0 + 1 + 2 + ... + 99 see `paddle.io.Sampler` """ @@ -160,25 +170,24 @@ class RandomSampler(Sampler): .. code-block:: python - from paddle.io import Dataset, RandomSampler - - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([784]).astype('float32') - label = np.random.randint(0, 9, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - sampler = RandomSampler(data_source=RandomDataset(100)) - - for index in sampler: - print(index) - + >>> from paddle.io import Dataset, RandomSampler + + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([784]).astype('float32') + ... label = np.random.randint(0, 9, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> sampler = RandomSampler(data_source=RandomDataset(100)) + + >>> for index in sampler: + ... print(index) """ def __init__( @@ -288,14 +297,14 @@ class WeightedRandomSampler(Sampler): .. code-block:: python - from paddle.io import WeightedRandomSampler - - sampler = WeightedRandomSampler(weights=[0.1, 0.3, 0.5, 0.7, 0.2], - num_samples=5, - replacement=True) + >>> from paddle.io import WeightedRandomSampler - for index in sampler: - print(index) + >>> sampler = WeightedRandomSampler(weights=[0.1, 0.3, 0.5, 0.7, 0.2], + ... num_samples=5, + ... replacement=True) + ... + >>> for index in sampler: + ... print(index) """ def __init__(self, weights, num_samples, replacement=True): diff --git a/python/paddle/io/dataloader/worker.py b/python/paddle/io/dataloader/worker.py index 5eeeb849fc025..6c9bdf3de0d9a 100644 --- a/python/paddle/io/dataloader/worker.py +++ b/python/paddle/io/dataloader/worker.py @@ -94,51 +94,64 @@ def get_worker_info(): Returns: WorkerInfo: an instance of WorkerInfo which contains fields above. 
- .. note:: + Notes: For more usage and examples, please see :code:`paddle.io.IterableDataset` Example: .. code-block:: python - import math - import paddle - import numpy as np - from paddle.io import IterableDataset, DataLoader, get_worker_info - - class SplitedIterableDataset(IterableDataset): - def __init__(self, start, end): - self.start = start - self.end = end - - def __iter__(self): - worker_info = get_worker_info() - if worker_info is None: - iter_start = self.start - iter_end = self.end - else: - per_worker = int( - math.ceil((self.end - self.start) / float( - worker_info.num_workers))) - worker_id = worker_info.id - iter_start = self.start + worker_id * per_worker - iter_end = min(iter_start + per_worker, self.end) - - for i in range(iter_start, iter_end): - yield np.array([i]) - - place = paddle.CPUPlace() - dataset = SplitedIterableDataset(start=2, end=9) - dataloader = DataLoader( - dataset, - places=place, - num_workers=2, - batch_size=1, - drop_last=True) - - for data in dataloader: - print(data) - # outputs: [2, 5, 3, 6, 4, 7] + >>> import math + >>> import paddle + >>> import numpy as np + >>> from paddle.io import IterableDataset, DataLoader, get_worker_info + + >>> class SplitedIterableDataset(IterableDataset): + ... def __init__(self, start, end): + ... self.start = start + ... self.end = end + ... + ... def __iter__(self): + ... worker_info = get_worker_info() + ... if worker_info is None: + ... iter_start = self.start + ... iter_end = self.end + ... else: + ... per_worker = int( + ... math.ceil((self.end - self.start) / float( + ... worker_info.num_workers))) + ... worker_id = worker_info.id + ... iter_start = self.start + worker_id * per_worker + ... iter_end = min(iter_start + per_worker, self.end) + ... + ... for i in range(iter_start, iter_end): + ... yield np.array([i]) + ... + >>> place = paddle.CPUPlace() + >>> dataset = SplitedIterableDataset(start=2, end=9) + >>> dataloader = DataLoader( + ... dataset, + ... places=place, + ... num_workers=2, + ... batch_size=1, + ... drop_last=True) + ... + >>> for data in dataloader: + ... print(data) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[2]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[3]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[4]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[5]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[6]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[7]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[8]]) """ return _worker_info diff --git a/python/paddle/io/reader.py b/python/paddle/io/reader.py index 861d1253dcfb9..1ee61fa6f2f88 100644 --- a/python/paddle/io/reader.py +++ b/python/paddle/io/reader.py @@ -234,7 +234,7 @@ class DataLoader: For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler` - .. note:: + Notes: GPU tensor operation is not supported in subprocess currently, please don't use GPU tensor operations in pipeline which will be performed in subprocess, such as dataset transforms, collte_fn, @@ -250,7 +250,7 @@ class DataLoader: :attr:`collate_fn` or :attr:`default_collate_fn`. - .. note:: + Notes: When automatic batching is disabled, :attr:`default_collate_fn` will do nothing to data from dataset. @@ -321,68 +321,66 @@ class DataLoader: .. 
code-block:: python - import numpy as np - - import paddle - import paddle.nn as nn - import paddle.nn.functional as F - from paddle.io import Dataset, BatchSampler, DataLoader - - BATCH_NUM = 20 - BATCH_SIZE = 16 - EPOCH_NUM = 4 - - IMAGE_SIZE = 784 - CLASS_NUM = 10 - - # define a random dataset - class RandomDataset(Dataset): - def __init__(self, num_samples): - self.num_samples = num_samples - - def __getitem__(self, idx): - image = np.random.random([IMAGE_SIZE]).astype('float32') - label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') - return image, label - - def __len__(self): - return self.num_samples - - dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) - - class SimpleNet(nn.Layer): - def __init__(self): - super().__init__() - self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM) - - def forward(self, image, label=None): - return self.fc(image) - - simple_net = SimpleNet() - opt = paddle.optimizer.SGD(learning_rate=1e-3, - parameters=simple_net.parameters()) - - loader = DataLoader(dataset, - batch_size=BATCH_SIZE, - shuffle=True, - drop_last=True, - num_workers=2) - - for e in range(EPOCH_NUM): - for i, (image, label) in enumerate(loader()): - out = simple_net(image) - loss = F.cross_entropy(out, label) - avg_loss = paddle.mean(loss) - avg_loss.backward() - opt.minimize(avg_loss) - simple_net.clear_gradients() - print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) - - - .. note:: + >>> import numpy as np + + >>> import paddle + >>> import paddle.nn as nn + >>> import paddle.nn.functional as F + >>> from paddle.io import Dataset, BatchSampler, DataLoader + + >>> BATCH_NUM = 20 + >>> BATCH_SIZE = 16 + >>> EPOCH_NUM = 4 + + >>> IMAGE_SIZE = 784 + >>> CLASS_NUM = 10 + + >>> # define a random dataset + >>> class RandomDataset(Dataset): + ... def __init__(self, num_samples): + ... self.num_samples = num_samples + ... + ... def __getitem__(self, idx): + ... image = np.random.random([IMAGE_SIZE]).astype('float32') + ... label = np.random.randint(0, CLASS_NUM - 1, (1, )).astype('int64') + ... return image, label + ... + ... def __len__(self): + ... return self.num_samples + ... + >>> dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) + + >>> class SimpleNet(nn.Layer): + ... def __init__(self): + ... super().__init__() + ... self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM) + ... + ... def forward(self, image, label=None): + ... return self.fc(image) + ... + >>> simple_net = SimpleNet() + >>> opt = paddle.optimizer.SGD(learning_rate=1e-3, + ... parameters=simple_net.parameters()) + ... + >>> loader = DataLoader(dataset, + ... batch_size=BATCH_SIZE, + ... shuffle=True, + ... drop_last=True, + ... num_workers=2) + ... + >>> for e in range(EPOCH_NUM): + ... for i, (image, label) in enumerate(loader()): + ... out = simple_net(image) + ... loss = F.cross_entropy(out, label) + ... avg_loss = paddle.mean(loss) + ... avg_loss.backward() + ... opt.minimize(avg_loss) + ... simple_net.clear_gradients() + ... 
print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) + + Notes: For reading iterable dataset with multiprocess Dataloader, please see :code:`paddle.io.IterableDataset` - """ def __init__( From e8a6b31596152321e00a04c42b717f30d75e0ad3 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Thu, 27 Jul 2023 03:09:40 +0800 Subject: [PATCH 2/7] preview, test=docs_preview From fbce310a1977cc16d982ead7d4df1240579b81b7 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Tue, 1 Aug 2023 21:20:08 +0800 Subject: [PATCH 3/7] update example code, test=docs_preview --- python/paddle/io/dataloader/batch_sampler.py | 7 +++ python/paddle/io/dataloader/dataset.py | 48 ++++++++++---------- python/paddle/io/dataloader/sampler.py | 23 ++++++++-- 3 files changed, 49 insertions(+), 29 deletions(-) diff --git a/python/paddle/io/dataloader/batch_sampler.py b/python/paddle/io/dataloader/batch_sampler.py index b8349fd6d92ab..78c93151a390d 100644 --- a/python/paddle/io/dataloader/batch_sampler.py +++ b/python/paddle/io/dataloader/batch_sampler.py @@ -58,8 +58,10 @@ class BatchSampler(Sampler): .. code-block:: python + >>> import numpy as np >>> from paddle.io import RandomSampler, BatchSampler, Dataset + >>> np.random.seed(2023) >>> # init with dataset >>> class RandomDataset(Dataset): ... def __init__(self, num_samples): @@ -80,7 +82,9 @@ class BatchSampler(Sampler): ... >>> for batch_indices in bs: ... print(batch_indices) + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] ... + [96, 97, 98, 99] >>> # init with sampler >>> sampler = RandomSampler(RandomDataset(100)) >>> bs = BatchSampler(sampler=sampler, @@ -89,6 +93,9 @@ class BatchSampler(Sampler): ... >>> for batch_indices in bs: ... print(batch_indices) + [56, 12, 68, 0, 82, 66, 91, 44] + ... + [53, 17, 22, 86, 52, 3, 92, 33] """ def __init__( diff --git a/python/paddle/io/dataloader/dataset.py b/python/paddle/io/dataloader/dataset.py index 5253c647b1db3..16a5f0379523f 100755 --- a/python/paddle/io/dataloader/dataset.py +++ b/python/paddle/io/dataloader/dataset.py @@ -55,7 +55,8 @@ class Dataset: ... >>> dataset = RandomDataset(10) >>> for i in range(len(dataset)): - ... print(dataset[i]) + ... image, label = dataset[i] + ... # do something """ def __init__(self): @@ -109,8 +110,9 @@ class IterableDataset(Dataset): ... yield image, label ... >>> dataset = RandomDataset(10) - >>> for img, lbl in dataset: - ... print(img, lbl) + >>> for img, label in dataset: + ... # do something + ... ... When :attr:`num_workers > 0`, each worker has a different copy of the dataset object and will yield whole dataset samples, which means samples in dataset will be repeated in @@ -158,7 +160,7 @@ class IterableDataset(Dataset): ... drop_last=True) ... >>> for data in dataloader: - ... print(data) + ... print(data) # doctest: +SKIP Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, [[2]]) Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, @@ -216,7 +218,7 @@ class IterableDataset(Dataset): ... worker_init_fn=worker_init_fn) ... >>> for data in dataloader: - ... print(data) + ... print(data) # doctest: +SKIP Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, [[2]]) Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, @@ -288,7 +290,7 @@ class TensorDataset(Dataset): >>> for i in range(len(dataset)): ... input, label = dataset[i] - ... print(input, label) + ... 
# do something """ def __init__(self, tensors): @@ -354,10 +356,7 @@ class ComposeDataset(Dataset): >>> dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)]) >>> for i in range(len(dataset)): ... image1, label1, image2, label2 = dataset[i] - ... print(image1) - ... print(label1) - ... print(image2) - ... print(label2) + ... # do something """ def __init__(self, datasets): @@ -420,7 +419,9 @@ class ChainDataset(IterableDataset): ... >>> dataset = ChainDataset([RandomDataset(10), RandomDataset(10)]) >>> for image, label in iter(dataset): - ... print(image, label) + ... # do something + ... ... + """ def __init__(self, datasets): @@ -497,6 +498,7 @@ def random_split(dataset, lengths, generator=None): >>> import paddle >>> from paddle.io import random_split + >>> paddle.seed(2023) >>> a_list = paddle.io.random_split(range(10), [3, 7]) >>> print(len(a_list)) 2 @@ -504,24 +506,20 @@ def random_split(dataset, lengths, generator=None): >>> # output of the first subset >>> for idx, v in enumerate(a_list[0]): ... print(idx, v) - >>> # doctest: +SKIP - 0 1 - 1 3 - 2 9 - >>> # doctest: -SKIP + 0 8 + 1 2 + 2 5 >>> # output of the second subset >>> for idx, v in enumerate(a_list[1]): ... print(idx, v) - >>> # doctest: +SKIP - 0 5 - 1 7 - 2 8 - 3 6 - 4 0 - 5 2 - 6 4 - >>> # doctest: -SKIP + 0 9 + 1 6 + 2 3 + 3 4 + 4 1 + 5 0 + 6 7 """ # Cannot verify that dataset is Sized if sum(lengths) != len(dataset): # type: ignore diff --git a/python/paddle/io/dataloader/sampler.py b/python/paddle/io/dataloader/sampler.py index e85f51510b704..d26316ecc0eb7 100644 --- a/python/paddle/io/dataloader/sampler.py +++ b/python/paddle/io/dataloader/sampler.py @@ -170,8 +170,10 @@ class RandomSampler(Sampler): .. code-block:: python + >>> import numpy as np >>> from paddle.io import Dataset, RandomSampler + >>> np.random.seed(2023) >>> class RandomDataset(Dataset): ... def __init__(self, num_samples): ... self.num_samples = num_samples @@ -188,6 +190,11 @@ class RandomSampler(Sampler): >>> for index in sampler: ... print(index) + 56 + 12 + 68 + ... + 87 """ def __init__( @@ -297,14 +304,22 @@ class WeightedRandomSampler(Sampler): .. code-block:: python + >>> import numpy as np >>> from paddle.io import WeightedRandomSampler - >>> sampler = WeightedRandomSampler(weights=[0.1, 0.3, 0.5, 0.7, 0.2], - ... num_samples=5, - ... replacement=True) - ... + >>> np.random.seed(2023) + >>> sampler = WeightedRandomSampler( + ... weights=[0.1, 0.3, 0.5, 0.7, 0.2], + ... num_samples=5, + ... replacement=True + ... ) >>> for index in sampler: ... print(index) + 2 + 4 + 3 + 1 + 1 """ def __init__(self, weights, num_samples, replacement=True): From 71c80e8feadcb7e5d021a24fa7a8462dbe386cc3 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 2 Aug 2023 09:57:03 +0800 Subject: [PATCH 4/7] update output, test=docs_preview --- python/paddle/io/dataloader/dataset.py | 18 +++++++++--------- python/paddle/io/dataloader/worker.py | 14 ++++++-------- python/paddle/io/multiprocess_utils.py | 2 -- python/paddle/io/reader.py | 2 -- 4 files changed, 15 insertions(+), 21 deletions(-) diff --git a/python/paddle/io/dataloader/dataset.py b/python/paddle/io/dataloader/dataset.py index 16a5f0379523f..a01da3292e37e 100755 --- a/python/paddle/io/dataloader/dataset.py +++ b/python/paddle/io/dataloader/dataset.py @@ -506,20 +506,20 @@ def random_split(dataset, lengths, generator=None): >>> # output of the first subset >>> for idx, v in enumerate(a_list[0]): ... 
print(idx, v) - 0 8 - 1 2 + 0 7 + 1 6 2 5 >>> # output of the second subset >>> for idx, v in enumerate(a_list[1]): ... print(idx, v) - 0 9 - 1 6 - 2 3 - 3 4 - 4 1 - 5 0 - 6 7 + 0 1 + 1 9 + 2 4 + 3 2 + 4 0 + 5 3 + 6 8 """ # Cannot verify that dataset is Sized if sum(lengths) != len(dataset): # type: ignore diff --git a/python/paddle/io/dataloader/worker.py b/python/paddle/io/dataloader/worker.py index 6c9bdf3de0d9a..6e214dbf63102 100644 --- a/python/paddle/io/dataloader/worker.py +++ b/python/paddle/io/dataloader/worker.py @@ -13,8 +13,6 @@ # limitations under the License. import os - -# NOTE: queue has a different name in python2 and python3 import queue import sys import traceback @@ -141,17 +139,17 @@ def get_worker_info(): Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, [[2]]) Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, - [[3]]) - Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, - [[4]]) - Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, - [[5]]) - Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, [[6]]) Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[3]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, [[7]]) Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[4]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, [[8]]) + Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, + [[5]]) """ return _worker_info diff --git a/python/paddle/io/multiprocess_utils.py b/python/paddle/io/multiprocess_utils.py index 51b0c2b818214..c57b6dae86b5e 100644 --- a/python/paddle/io/multiprocess_utils.py +++ b/python/paddle/io/multiprocess_utils.py @@ -13,8 +13,6 @@ # limitations under the License. import atexit - -# NOTE: queue has a different name in python2 and python3 import queue import signal import sys diff --git a/python/paddle/io/reader.py b/python/paddle/io/reader.py index 1ee61fa6f2f88..d8db6cc2ab012 100644 --- a/python/paddle/io/reader.py +++ b/python/paddle/io/reader.py @@ -14,8 +14,6 @@ import copy import multiprocessing - -# NOTE: queue has a different name in python2 and python3 import sys import time import warnings From 90343f5bef35732ea9a6403596a8b64eff457c44 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 2 Aug 2023 09:58:55 +0800 Subject: [PATCH 5/7] remove unused imports, test=docs_preview --- python/paddle/io/dataloader/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/io/dataloader/dataset.py b/python/paddle/io/dataloader/dataset.py index a01da3292e37e..cf4f8981bef91 100755 --- a/python/paddle/io/dataloader/dataset.py +++ b/python/paddle/io/dataloader/dataset.py @@ -496,7 +496,6 @@ def random_split(dataset, lengths, generator=None): .. 
code-block:: python >>> import paddle - >>> from paddle.io import random_split >>> paddle.seed(2023) >>> a_list = paddle.io.random_split(range(10), [3, 7]) From 8619edd3aaea9da36be250259598aacc53bd8421 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 2 Aug 2023 11:28:24 +0800 Subject: [PATCH 6/7] skip some device depends apis, test=docs_preview --- python/paddle/io/dataloader/dataset.py | 4 ++-- python/paddle/io/dataloader/worker.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/io/dataloader/dataset.py b/python/paddle/io/dataloader/dataset.py index cf4f8981bef91..4790ad5f3fd1e 100755 --- a/python/paddle/io/dataloader/dataset.py +++ b/python/paddle/io/dataloader/dataset.py @@ -504,14 +504,14 @@ def random_split(dataset, lengths, generator=None): >>> # output of the first subset >>> for idx, v in enumerate(a_list[0]): - ... print(idx, v) + ... print(idx, v) # doctest: +skip 0 7 1 6 2 5 >>> # output of the second subset >>> for idx, v in enumerate(a_list[1]): - ... print(idx, v) + ... print(idx, v) # doctest: +skip 0 1 1 9 2 4 diff --git a/python/paddle/io/dataloader/worker.py b/python/paddle/io/dataloader/worker.py index 6e214dbf63102..cd9ca6e081692 100644 --- a/python/paddle/io/dataloader/worker.py +++ b/python/paddle/io/dataloader/worker.py @@ -135,7 +135,7 @@ def get_worker_info(): ... drop_last=True) ... >>> for data in dataloader: - ... print(data) + ... print(data) # doctest: +SKIP Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, [[2]]) Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, From a89232c32cbd3e3fab559cdbb2f8ef8de93d9ac0 Mon Sep 17 00:00:00 2001 From: SigureMo Date: Wed, 2 Aug 2023 11:32:05 +0800 Subject: [PATCH 7/7] add skip reason, test=docs_preview --- python/paddle/io/dataloader/dataset.py | 8 ++++---- python/paddle/io/dataloader/worker.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/paddle/io/dataloader/dataset.py b/python/paddle/io/dataloader/dataset.py index 4790ad5f3fd1e..4daf410a31836 100755 --- a/python/paddle/io/dataloader/dataset.py +++ b/python/paddle/io/dataloader/dataset.py @@ -160,7 +160,7 @@ class IterableDataset(Dataset): ... drop_last=True) ... >>> for data in dataloader: - ... print(data) # doctest: +SKIP + ... print(data) # doctest: +SKIP("The output depends on the environment.") Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, [[2]]) Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, @@ -218,7 +218,7 @@ class IterableDataset(Dataset): ... worker_init_fn=worker_init_fn) ... >>> for data in dataloader: - ... print(data) # doctest: +SKIP + ... print(data) # doctest: +SKIP("The output depends on the environment.") Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, [[2]]) Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, @@ -504,14 +504,14 @@ def random_split(dataset, lengths, generator=None): >>> # output of the first subset >>> for idx, v in enumerate(a_list[0]): - ... print(idx, v) # doctest: +skip + ... print(idx, v) # doctest: +SKIP("The output depends on the environment.") 0 7 1 6 2 5 >>> # output of the second subset >>> for idx, v in enumerate(a_list[1]): - ... print(idx, v) # doctest: +skip + ... 
print(idx, v) # doctest: +SKIP("The output depends on the environment.") 0 1 1 9 2 4 diff --git a/python/paddle/io/dataloader/worker.py b/python/paddle/io/dataloader/worker.py index cd9ca6e081692..4a1667483da64 100644 --- a/python/paddle/io/dataloader/worker.py +++ b/python/paddle/io/dataloader/worker.py @@ -135,7 +135,7 @@ def get_worker_info(): ... drop_last=True) ... >>> for data in dataloader: - ... print(data) # doctest: +SKIP + ... print(data) # doctest: +SKIP("The output depends on the environment.") Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True, [[2]]) Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,
        [[2]])
        Tensor(shape=[1, 1], dtype=int64, place=Place(cpu), stop_gradient=True,