Spaces:
Running
on
Zero
Running
on
Zero
File size: 4,901 Bytes
d1ed09d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 |
import random
from dask.utils import import_required
def timeseries(
    start="2000-01-01",
    end="2000-01-31",
    freq="1s",
    partition_freq="1d",
    dtypes=None,
    seed=None,
    **kwargs,
):
    """Build a dataframe of random data indexed by a regular time series.

    This is a thin convenience wrapper: it fills in a default column layout
    and forwards every argument to
    ``dask.dataframe.io.demo.make_timeseries``.

    Parameters
    ----------
    start : datetime (or datetime-like string)
        Start of time series
    end : datetime (or datetime-like string)
        End of time series
    dtypes : dict (optional)
        Mapping of column names to types.
        Valid types include {float, int, str, 'category'}.
        When omitted, the columns are ``name`` (str), ``id`` (int),
        ``x`` (float) and ``y`` (float).
    freq : string
        String like '2s' or '1H' or '12W' for the time series frequency
    partition_freq : string
        String like '1M' or '2Y' to divide the dataframe into partitions
    seed : int (optional)
        Randomstate seed
    kwargs:
        Keywords to pass down to individual column creation functions.
        Keywords should be prefixed by the column name and then an underscore.

    Returns
    -------
    Whatever ``make_timeseries`` returns for the given arguments.

    Examples
    --------
    >>> import dask
    >>> df = dask.datasets.timeseries()
    >>> df.head()  # doctest: +SKIP
              timestamp    id     name         x         y
    2000-01-01 00:00:00   967    Jerry -0.031348 -0.040633
    2000-01-01 00:00:01  1066  Michael -0.262136  0.307107
    2000-01-01 00:00:02   988    Wendy -0.526331  0.128641
    2000-01-01 00:00:03  1016   Yvonne  0.620456  0.767270
    2000-01-01 00:00:04   998   Ursula  0.684902 -0.463278

    >>> df = dask.datasets.timeseries(
    ...     '2000', '2010',
    ...     freq='2H', partition_freq='1D', seed=1,  # data frequency
    ...     dtypes={'value': float, 'name': str, 'id': int},  # data types
    ...     id_lam=1000  # control number of items in id column
    ... )
    """
    # Imported here rather than at module level, so the dataframe demo
    # module is only loaded when this function is actually called.
    from dask.dataframe.io.demo import make_timeseries

    # A fresh dict is created on every call; an explicit (even empty)
    # mapping supplied by the caller is passed through untouched.
    column_types = (
        {"name": str, "id": int, "x": float, "y": float} if dtypes is None else dtypes
    )

    options = dict(
        start=start,
        end=end,
        freq=freq,
        partition_freq=partition_freq,
        seed=seed,
        dtypes=column_types,
    )
    return make_timeseries(**options, **kwargs)
def _generate_mimesis(field, schema_description, records_per_partition, seed):
    """Generate the records for a single partition of a dask bag.

    Parameters
    ----------
    field : dict
        Keyword arguments forwarded to ``mimesis.schema.Field``.
    schema_description : callable
        Called with the seeded ``Field`` instance; its result is what the
        mimesis ``Schema`` is built from.
    records_per_partition : int
        Number of records to produce.
    seed : int
        Seed handed to ``Field``.

    Returns
    -------
    list
        ``records_per_partition`` items, each the first element of a
        one-iteration ``Schema.create`` call.

    See Also
    --------
    _make_mimesis
    """
    from mimesis.schema import Field, Schema

    provider = Field(seed=seed, **field)
    schema = Schema(schema=lambda: schema_description(provider))

    # One ``create`` call per record, in order, taking the single item each
    # call yields.
    records = []
    for _ in range(records_per_partition):
        batch = schema.create(iterations=1)
        records.append(batch[0])
    return records
def _make_mimesis(field, schema, npartitions, records_per_partition, seed=None):
    """
    Make a Dask Bag filled with data randomly generated by the mimesis project

    Parameters
    ----------
    field: dict
        keyword arguments to pass to ``mimesis.Field``
        (a falsy value is treated as an empty dict)
    schema: Callable[Field] -> dict
        The schema to use to generate the data
    npartitions: int
        Number of partitions; one generation task is created per partition
    records_per_partition: int
        Number of records each task generates
    seed: int, None
        Seed for random data.  It seeds a ``random.Random`` from which one
        independent seed per partition is drawn.

    Returns
    -------
    Dask Bag

    See Also
    --------
    make_people
    """
    import dask.bag as db
    from dask.base import tokenize

    field = field or {}

    random_state = random.Random(seed)
    partition_seeds = [
        random_state.randint(0, 1 << 32) for _ in range(npartitions)
    ]

    # Tokenize the per-partition seeds rather than the user-supplied ``seed``.
    # With ``seed=None`` the generator above is seeded from fresh entropy on
    # every call, so two calls produce different data; hashing only ``None``
    # would give both bags identical task keys, and they would collide when
    # merged into one graph.  With an explicit seed the derived seeds (and
    # hence the name) stay deterministic.
    name = "mimesis-" + tokenize(
        field, schema, npartitions, records_per_partition, partition_seeds
    )

    # One task per partition, each with its own seed.
    dsk = {
        (name, i): (
            _generate_mimesis,
            field,
            schema,
            records_per_partition,
            partition_seed,
        )
        for i, partition_seed in enumerate(partition_seeds)
    }

    return db.Bag(dsk, name, npartitions)
def make_people(npartitions=10, records_per_partition=1000, seed=None, locale="en"):
    """Make a dataset of random people

    Produces a Dask Bag whose records are dictionaries describing randomly
    generated people.  The optional library ``mimesis`` must be installed;
    it is what generates the records.

    Parameters
    ----------
    npartitions : int
        Number of partitions
    records_per_partition : int
        Number of records in each partition
    seed : int, (optional)
        Random seed
    locale : str
        Language locale, like 'en', 'fr', 'zh', or 'ru'

    Returns
    -------
    b: Dask Bag
    """
    install_hint = (
        "The mimesis module is required for this function. Try:\n"
        " python -m pip install mimesis"
    )
    import_required("mimesis", install_hint)

    def person_record(field):
        # The entries are evaluated top to bottom, so keep this order: it
        # fixes the sequence of calls made on ``field``.
        return {
            "age": field("person.age"),
            "name": (field("person.name"), field("person.surname")),
            "occupation": field("person.occupation"),
            "telephone": field("person.telephone"),
            "address": {
                "address": field("address.address"),
                "city": field("address.city"),
            },
            "credit-card": {
                "number": field("payment.credit_card_number"),
                "expiration-date": field("payment.credit_card_expiration_date"),
            },
        }

    return _make_mimesis(
        field={"locale": locale},
        schema=person_record,
        npartitions=npartitions,
        records_per_partition=records_per_partition,
        seed=seed,
    )
|