import random

from dask.utils import import_required


def timeseries(
    start="2000-01-01",
    end="2000-01-31",
    freq="1s",
    partition_freq="1d",
    dtypes=None,
    seed=None,
    **kwargs,
):
    """Create timeseries dataframe with random data

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq : string
        String like '1M' or '2Y' to divide the dataframe into partitions
    dtypes : dict (optional)
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}
    seed : int (optional)
        Seed for the random number generator
    kwargs:
        Keywords to pass down to individual column creation functions.
        Keywords should be prefixed by the column name and then an underscore.

    Examples
    --------
    >>> import dask
    >>> df = dask.datasets.timeseries()
    >>> df.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278
    >>> df = dask.datasets.timeseries(
    ...     '2000', '2010',
    ...     freq='2H', partition_freq='1D', seed=1,  # data frequency
    ...     dtypes={'value': float, 'name': str, 'id': int},  # data types
    ...     id_lam=1000  # control number of items in id column
    ... )
    """
    from dask.dataframe.io.demo import make_timeseries

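    # Default columns when no dtypes mapping is given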
    if dtypes is None:
        dtypes = {"name": str, "id": int, "x": float, "y": float}

    return make_timeseries(
        start=start,
        end=end,
        freq=freq,
        partition_freq=partition_freq,
        seed=seed,
        dtypes=dtypes,
        **kwargs,
    )


def _generate_mimesis(field, schema_description, records_per_partition, seed):
    """Generate data for a single partition of a dask bag

    See Also
    --------
    _make_mimesis
    """
    from mimesis.schema import Field, Schema

    field = Field(seed=seed, **field)
    schema = Schema(schema=lambda: schema_description(field))
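    # Create records one at a time; each call draws fresh values from the seeded Field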
    return [schema.create(iterations=1)[0] for i in range(records_per_partition)]


def _make_mimesis(field, schema, npartitions, records_per_partition, seed=None):
    """
    Make a Dask Bag filled with data randomly generated by the mimesis project

    Parameters
    ----------
    field : dict
        Keyword arguments to pass to ``mimesis.Field``
    schema : callable
        Function that takes a ``mimesis.Field`` and returns a record ``dict``
    npartitions : int
        Number of partitions
    records_per_partition : int
        Number of records in each partition
    seed : int (optional)
        Seed for random data

    Returns
    -------
    Dask Bag

    See Also
    --------
    make_people
    """
    import dask.bag as db
    from dask.base import tokenize

    field = field or {}

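    # Derive one independent seed per partition so partitions generate different records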
    random_state = random.Random(seed)
    seeds = [random_state.randint(0, 1 << 32) for _ in range(npartitions)]

    name = "mimesis-" + tokenize(
        field, schema, npartitions, records_per_partition, seed
    )
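    # Low-level graph: one task per partition, keyed by (name, i), each generating
    # its own records with the per-partition seed drawn above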
    dsk = {
        (name, i): (_generate_mimesis, field, schema, records_per_partition, seed)
        for i, seed in enumerate(seeds)
    }

    return db.Bag(dsk, name, npartitions)


def make_people(npartitions=10, records_per_partition=1000, seed=None, locale="en"):
    """Make a dataset of random people

    This makes a Dask Bag with dictionary records of randomly generated people.
    This requires the optional library ``mimesis`` to generate records.

    Parameters
    ----------
    npartitions : int
        Number of partitions
    records_per_partition : int
        Number of records in each partition
    seed : int (optional)
        Random seed
    locale : str
        Language locale, like 'en', 'fr', 'zh', or 'ru'

    Returns
    -------
    b: Dask Bag
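
    Examples
    --------
    A minimal illustrative usage; records vary unless ``seed`` is fixed:

    >>> import dask
    >>> b = dask.datasets.make_people(seed=123)  # doctest: +SKIP
    >>> b.take(1)  # doctest: +SKIP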
    """
    import_required(
        "mimesis",
        "The mimesis module is required for this function.  Try:\n"
        "  python -m pip install mimesis",
    )

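    # Each field("provider.method") call draws a value from the corresponding mimesis provider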
    schema = lambda field: {
        "age": field("person.age"),
        "name": (field("person.name"), field("person.surname")),
        "occupation": field("person.occupation"),
        "telephone": field("person.telephone"),
        "address": {"address": field("address.address"), "city": field("address.city")},
        "credit-card": {
            "number": field("payment.credit_card_number"),
            "expiration-date": field("payment.credit_card_expiration_date"),
        },
    }

    return _make_mimesis(
        {"locale": locale}, schema, npartitions, records_per_partition, seed
    )