|
""" |
|
test with the TimeGrouper / grouping with datetimes |
|
""" |
|
from datetime import ( |
|
datetime, |
|
timedelta, |
|
) |
|
|
|
import numpy as np |
|
import pytest |
|
import pytz |
|
|
|
import pandas as pd |
|
from pandas import ( |
|
DataFrame, |
|
DatetimeIndex, |
|
Index, |
|
MultiIndex, |
|
Series, |
|
Timestamp, |
|
date_range, |
|
offsets, |
|
) |
|
import pandas._testing as tm |
|
from pandas.core.groupby.grouper import Grouper |
|
from pandas.core.groupby.ops import BinGrouper |
|
|
|
|
|
@pytest.fixture
def frame_for_truncated_bingrouper():
    """
    Build the DataFrame shared by groupby_with_truncated_bingrouper.

    Kept as a standalone fixture so
    test_groupby_apply_timegrouper_with_nat_apply_squeeze can reuse it.
    """
    quantities = [18, 3, 5, 1, 9, 3]
    # One NaT entry so the time-grouper drops a row when binning.
    dates = [
        Timestamp(2013, 9, 1, 13, 0),
        Timestamp(2013, 9, 1, 13, 5),
        Timestamp(2013, 10, 1, 20, 0),
        Timestamp(2013, 10, 3, 10, 0),
        pd.NaT,
        Timestamp(2013, 9, 2, 14, 0),
    ]
    return DataFrame({"Quantity": quantities, "Date": dates})
|
|
|
|
|
@pytest.fixture
def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
    """
    GroupBy object such that gb._grouper is a BinGrouper and
    len(gb._grouper.result_index) < len(gb._grouper.group_keys_seq)

    Aggregations on this groupby should have

    dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")

    As either the index or an index level.
    """
    gb = frame_for_truncated_bingrouper.groupby(Grouper(key="Date", freq="5D"))

    # Sanity check: the truncation this fixture exists to exercise is present.
    assert len(gb._grouper.result_index) != len(gb._grouper.group_keys_seq)

    return gb
|
|
|
|
|
class TestGroupBy:
    """Tests for groupby with datetime keys, Grouper(freq=...) and related ops."""

    def test_groupby_with_timegrouper(self):
        # resample("5D") and groupby(Grouper(freq="5D")) on a DatetimeIndex
        # should produce identical results, regardless of input row order.
        df_original = DataFrame(
            {
                "Buyer": "Carl Carl Carl Carl Joe Carl".split(),
                "Quantity": [18, 3, 5, 1, 9, 3],
                "Date": [
                    datetime(2013, 9, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 3, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 9, 2, 14, 0),
                ],
            }
        )

        df_reordered = df_original.sort_values(by="Quantity")

        for df in [df_original, df_reordered]:
            df = df.set_index(["Date"])

            # Full 5-day grid spanning the data; most bins are empty.
            exp_dti = date_range(
                "20130901",
                "20131205",
                freq="5D",
                name="Date",
                inclusive="left",
                unit=df.index.unit,
            )
            expected = DataFrame(
                {"Buyer": 0, "Quantity": 0},
                index=exp_dti,
            )
            # "Buyer" must be object dtype so string concatenations (the sum
            # of strings per bin) can be assigned below.
            expected = expected.astype({"Buyer": object})
            expected.iloc[0, 0] = "CarlCarlCarl"
            expected.iloc[6, 0] = "CarlCarl"
            expected.iloc[18, 0] = "Joe"
            expected.iloc[[0, 6, 18], 1] = np.array([24, 6, 9], dtype="int64")

            result1 = df.resample("5D").sum()
            tm.assert_frame_equal(result1, expected)

            # Sorted and unsorted frames must bin identically.
            df_sorted = df.sort_index()
            result2 = df_sorted.groupby(Grouper(freq="5D")).sum()
            tm.assert_frame_equal(result2, expected)

            result3 = df.groupby(Grouper(freq="5D")).sum()
            tm.assert_frame_equal(result3, expected)

    @pytest.mark.parametrize("should_sort", [True, False])
    def test_groupby_with_timegrouper_methods(self, should_sort):
        # Grouper(freq=...) on a datetime index should yield a BinGrouper
        # and a dict of groups, whether or not the frame is pre-sorted.
        df = DataFrame(
            {
                "Branch": "A A A A A B".split(),
                "Buyer": "Carl Mark Carl Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 8, 9, 3],
                "Date": [
                    datetime(2013, 1, 1, 13, 0),
                    datetime(2013, 1, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 12, 2, 14, 0),
                ],
            }
        )

        if should_sort:
            df = df.sort_values(by="Quantity", ascending=False)

        df = df.set_index("Date", drop=False)
        g = df.groupby(Grouper(freq="6ME"))
        assert g.group_keys

        assert isinstance(g._grouper, BinGrouper)
        groups = g.groups
        assert isinstance(groups, dict)
        # Six rows spanning a year fall into three 6-month-end bins.
        assert len(groups) == 3

    def test_timegrouper_with_reg_groups(self):
        # Mixing a Grouper(freq=...) with regular column groupers, via
        # key=, level=, and a bare Grouper, on sorted and unsorted input.
        df_original = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 1, 1, 13, 0),
                    datetime(2013, 1, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 12, 2, 14, 0),
                ],
            }
        ).set_index("Date")

        df_sorted = df_original.sort_values(by="Quantity", ascending=False)

        for df in [df_original, df_sorted]:
            # Yearly bins: everything lands on 2013-12-31 (year end).
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 12, 31, 0, 0),
                        datetime(2013, 12, 31, 0, 0),
                        datetime(2013, 12, 31, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])

            # NOTE(review): msg is unused below — looks like a leftover from
            # a removed assert_produces_warning check; verify before relying on it.
            msg = "The default value of numeric_only"
            result = df.groupby([Grouper(freq="YE"), "Buyer"]).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

            # Semi-annual (month-start) bins split the data into two periods.
            expected = DataFrame(
                {
                    "Buyer": "Carl Mark Carl Joe".split(),
                    "Quantity": [1, 3, 9, 18],
                    "Date": [
                        datetime(2013, 1, 1, 0, 0),
                        datetime(2013, 1, 1, 0, 0),
                        datetime(2013, 7, 1, 0, 0),
                        datetime(2013, 7, 1, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

        # Second frame: all dates within one month, to exercise daily bins.
        df_original = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 10, 1, 13, 0),
                    datetime(2013, 10, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 2, 12, 0),
                    datetime(2013, 10, 2, 14, 0),
                ],
            }
        ).set_index("Date")

        df_sorted = df_original.sort_values(by="Quantity", ascending=False)
        for df in [df_original, df_sorted]:
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark Carl Joe".split(),
                    "Quantity": [6, 8, 3, 4, 10],
                    "Date": [
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 2, 0, 0),
                        datetime(2013, 10, 2, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])

            result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

            # Monthly bins collapse everything onto the month end.
            result = df.groupby([Grouper(freq="1ME"), "Buyer"]).sum(numeric_only=True)
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 10, 31, 0, 0),
                        datetime(2013, 10, 31, 0, 0),
                        datetime(2013, 10, 31, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            tm.assert_frame_equal(result, expected)

        # Grouping by key= when "Date" is a column rather than the index.
        df = df.reset_index()
        result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
            numeric_only=True
        )
        tm.assert_frame_equal(result, expected)

        # A key that does not exist must raise KeyError.
        with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
            df.groupby([Grouper(freq="1ME", key="foo"), "Buyer"]).sum()

        # Grouping by level= (by name and by position) when "Date" is the index.
        df = df.set_index("Date")
        result = df.groupby([Grouper(freq="1ME", level="Date"), "Buyer"]).sum(
            numeric_only=True
        )
        tm.assert_frame_equal(result, expected)
        result = df.groupby([Grouper(freq="1ME", level=0), "Buyer"]).sum(
            numeric_only=True
        )
        tm.assert_frame_equal(result, expected)

        # An invalid level name must raise ValueError.
        with pytest.raises(ValueError, match="The level foo is not valid"):
            df.groupby([Grouper(freq="1ME", level="foo"), "Buyer"]).sum()

        # When both a "Date" column and a datetime index exist, key= must
        # select the column (shifted two month-ends ahead of the index).
        df = df.copy()
        df["Date"] = df.index + offsets.MonthEnd(2)
        result = df.groupby([Grouper(freq="1ME", key="Date"), "Buyer"]).sum(
            numeric_only=True
        )
        expected = DataFrame(
            {
                "Buyer": "Carl Joe Mark".split(),
                "Quantity": [10, 18, 3],
                "Date": [
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                ],
            }
        ).set_index(["Date", "Buyer"])
        tm.assert_frame_equal(result, expected)

        # Passing both key and level at once is rejected.
        msg = "The Grouper cannot specify both a key and a level!"
        with pytest.raises(ValueError, match=msg):
            df.groupby(
                [Grouper(freq="1ME", key="Date", level="Date"), "Buyer"]
            ).sum()

        # A lone Grouper (scalar or in a one-element list) groups on the index.
        expected = DataFrame(
            [[31]],
            columns=["Quantity"],
            index=DatetimeIndex(
                [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date"
            ),
        )
        result = df.groupby(Grouper(freq="1ME")).sum(numeric_only=True)
        tm.assert_frame_equal(result, expected)

        result = df.groupby([Grouper(freq="1ME")]).sum(numeric_only=True)
        tm.assert_frame_equal(result, expected)

        # With key="Date" (the shifted column) the result index moves one
        # month later but keeps its MonthEnd freq.
        expected.index = expected.index.shift(1)
        assert expected.index.freq == offsets.MonthEnd()
        result = df.groupby(Grouper(freq="1ME", key="Date")).sum(numeric_only=True)
        tm.assert_frame_equal(result, expected)

        result = df.groupby([Grouper(freq="1ME", key="Date")]).sum(
            numeric_only=True
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("freq", ["D", "ME", "YE", "QE-APR"])
    def test_timegrouper_with_reg_groups_freq(self, freq):
        # groupby([Grouper(freq), col]) should match the equivalent
        # groupby(col).resample(freq) result for several frequencies.
        df = DataFrame(
            {
                "date": pd.to_datetime(
                    [
                        "20121002",
                        "20121007",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20121002",
                        "20121207",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20130202",
                        "20130305",
                    ]
                ),
                "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
                "whole_cost": [
                    1790,
                    364,
                    280,
                    259,
                    201,
                    623,
                    90,
                    312,
                    359,
                    301,
                    359,
                    801,
                ],
                "cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12],
            }
        ).set_index("date")

        # Reference result computed via resample; empty bins dropped so the
        # index levels match the groupby result (date first, then user_id).
        expected = (
            df.groupby("user_id")["whole_cost"]
            .resample(freq)
            .sum(min_count=1)
            .dropna()
            .reorder_levels(["date", "user_id"])
            .sort_index()
            .astype("int64")
        )
        expected.name = "whole_cost"

        result1 = (
            df.sort_index().groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
        )
        tm.assert_series_equal(result1, expected)

        # Unsorted input must produce the same result.
        result2 = df.groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
        tm.assert_series_equal(result2, expected)

    def test_timegrouper_get_group(self):
        # get_group with a Timestamp key, for Grouper via key=, mixed with a
        # column grouper, and on a datetime index.
        df_original = DataFrame(
            {
                "Buyer": "Carl Joe Joe Carl Joe Carl".split(),
                "Quantity": [18, 3, 5, 1, 9, 3],
                "Date": [
                    datetime(2013, 9, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 3, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 9, 2, 14, 0),
                ],
            }
        )
        df_reordered = df_original.sort_values(by="Quantity")

        # Expected rows per month-end group.
        expected_list = [
            df_original.iloc[[0, 1, 5]],
            df_original.iloc[[2, 3]],
            df_original.iloc[[4]],
        ]
        dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(Grouper(freq="ME", key="Date"))
            for t, expected in zip(dt_list, expected_list):
                dt = Timestamp(t)
                result = grouped.get_group(dt)
                tm.assert_frame_equal(result, expected)

        # Tuple keys when mixing a column grouper with the time grouper.
        expected_list = [
            df_original.iloc[[1]],
            df_original.iloc[[3]],
            df_original.iloc[[4]],
        ]
        g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(["Buyer", Grouper(freq="ME", key="Date")])
            for (b, t), expected in zip(g_list, expected_list):
                dt = Timestamp(t)
                result = grouped.get_group((b, dt))
                tm.assert_frame_equal(result, expected)

        # Same checks with "Date" as the index instead of a column.
        df_original = df_original.set_index("Date")
        df_reordered = df_original.sort_values(by="Quantity")

        expected_list = [
            df_original.iloc[[0, 1, 5]],
            df_original.iloc[[2, 3]],
            df_original.iloc[[4]],
        ]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(Grouper(freq="ME"))
            for t, expected in zip(dt_list, expected_list):
                dt = Timestamp(t)
                result = grouped.get_group(dt)
                tm.assert_frame_equal(result, expected)

    def test_timegrouper_apply_return_type_series(self):
        # apply returning a Series should build the same frame whether the
        # grouper is a plain key or a freq-based time grouper.
        df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
        df_dt = df.copy()
        df_dt["date"] = pd.to_datetime(df_dt["date"])

        def sumfunc_series(x):
            # One-row Series labelled "sum".
            return Series([x["value"].sum()], ("sum",))

        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            expected = df.groupby(Grouper(key="date")).apply(sumfunc_series)
        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_series)
        # Indexes differ (string vs binned dates), so compare values only.
        tm.assert_frame_equal(
            result.reset_index(drop=True), expected.reset_index(drop=True)
        )

    def test_timegrouper_apply_return_type_value(self):
        # apply returning a scalar should build the same Series whether the
        # grouper is a plain key or a freq-based time grouper.
        df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
        df_dt = df.copy()
        df_dt["date"] = pd.to_datetime(df_dt["date"])

        def sumfunc_value(x):
            return x.value.sum()

        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            expected = df.groupby(Grouper(key="date")).apply(sumfunc_value)
        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            result = df_dt.groupby(Grouper(freq="ME", key="date")).apply(sumfunc_value)
        tm.assert_series_equal(
            result.reset_index(drop=True), expected.reset_index(drop=True)
        )

    def test_groupby_groups_datetimeindex(self):
        # Grouping a DatetimeIndex by a callable that returns datetimes
        # should yield datetime group keys.
        periods = 1000
        ind = date_range(start="2012/1/1", freq="5min", periods=periods)
        df = DataFrame(
            {"high": np.arange(periods), "low": np.arange(periods)}, index=ind
        )
        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

        # Group keys are datetime objects, not Timestamps or strings.
        groups = grouped.groups
        assert isinstance(next(iter(groups.keys())), datetime)

    def test_groupby_groups_datetimeindex2(self):
        # .groups on a datetime level keys by Timestamp; get_group accepts
        # a date string.
        index = date_range("2015/01/01", periods=5, name="date")
        df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
        result = df.groupby(level="date").groups
        dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"]
        expected = {
            Timestamp(date): DatetimeIndex([date], name="date") for date in dates
        }
        tm.assert_dict_equal(result, expected)

        grouped = df.groupby(level="date")
        for date in dates:
            result = grouped.get_group(date)
            data = [[df.loc[date, "A"], df.loc[date, "B"]]]
            expected_index = DatetimeIndex(
                [date], name="date", freq="D", dtype=index.dtype
            )
            expected = DataFrame(data, columns=list("AB"), index=expected_index)
            tm.assert_frame_equal(result, expected)

    def test_groupby_groups_datetimeindex_tz(self):
        # Grouping on tz-aware datetimes: the tz must be preserved in the
        # result index (both as a column grouper and as an index level).
        dates = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "datetime": dates,
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )
        df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific"))

        exp_idx1 = DatetimeIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 09:00:00",
            ],
            tz="US/Pacific",
            name="datetime",
        )
        exp_idx2 = Index(["a", "b"] * 3, name="label")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(["datetime", "label"]).sum()
        tm.assert_frame_equal(result, expected)

        # Same idea with a tz-aware index grouped by level=0.
        didx = DatetimeIndex(dates, tz="Asia/Tokyo")
        df = DataFrame(
            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
            index=didx,
        )

        exp_idx = DatetimeIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            tz="Asia/Tokyo",
        )
        expected = DataFrame(
            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(level=0).sum()
        tm.assert_frame_equal(result, expected)

    def test_frame_datetime64_handling_groupby(self):
        # first() on a datetime64 column should return a Timestamp.
        df = DataFrame(
            [(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))],
            columns=["a", "date"],
        )
        result = df.groupby("a").first()
        assert result["date"][3] == Timestamp("2012-07-03")

    def test_groupby_multi_timezone(self):
        # tz_localize applied per group: each group gets its own timezone.
        df = DataFrame(
            {
                "value": range(5),
                "date": [
                    "2000-01-28 16:47:00",
                    "2000-01-29 16:48:00",
                    "2000-01-30 16:49:00",
                    "2000-01-31 16:50:00",
                    "2000-01-01 16:50:00",
                ],
                "tz": [
                    "America/Chicago",
                    "America/Chicago",
                    "America/Los_Angeles",
                    "America/Chicago",
                    "America/New_York",
                ],
            }
        )

        result = df.groupby("tz", group_keys=False).date.apply(
            lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
        )

        # Mixed timezones -> object-dtype Series of tz-aware Timestamps.
        expected = Series(
            [
                Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"),
                Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"),
            ],
            name="date",
            dtype=object,
        )
        tm.assert_series_equal(result, expected)

        # get_group on one tz, then localize: original row positions kept.
        tz = "America/Chicago"
        res_values = df.groupby("tz").date.get_group(tz)
        result = pd.to_datetime(res_values).dt.tz_localize(tz)
        exp_values = Series(
            ["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"],
            index=[0, 1, 3],
            name="date",
        )
        expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
        tm.assert_series_equal(result, expected)

    def test_groupby_groups_periods(self):
        # Same structure as the tz test above, but with Period data:
        # grouping must preserve the PeriodIndex freq.
        dates = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "period": [pd.Period(d, freq="h") for d in dates],
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )

        exp_idx1 = pd.PeriodIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 09:00:00",
            ],
            freq="h",
            name="period",
        )
        exp_idx2 = Index(["a", "b"] * 3, name="label")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(["period", "label"]).sum()
        tm.assert_frame_equal(result, expected)

        # Grouping by level=0 of a PeriodIndex.
        didx = pd.PeriodIndex(dates, freq="h")
        df = DataFrame(
            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
            index=didx,
        )

        exp_idx = pd.PeriodIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            freq="h",
        )
        expected = DataFrame(
            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(level=0).sum()
        tm.assert_frame_equal(result, expected)

    def test_groupby_first_datetime64(self):
        # first() must preserve datetime64 dtype on frames and series.
        df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
        df[1] = df[1].astype("M8[ns]")

        assert issubclass(df[1].dtype.type, np.datetime64)

        result = df.groupby(level=0).first()
        got_dt = result[1].dtype
        assert issubclass(got_dt.type, np.datetime64)

        result = df[1].groupby(level=0).first()
        got_dt = result.dtype
        assert issubclass(got_dt.type, np.datetime64)

    def test_groupby_max_datetime64(self):
        # max() on a datetime64 column matches the apply-based equivalent.
        df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})

        # apply returns M8[s] here; cast the reference to match max().
        expected = df.groupby("A")["A"].apply(lambda x: x.max()).astype("M8[s]")
        result = df.groupby("A")["A"].max()
        tm.assert_series_equal(result, expected)

    def test_groupby_datetime64_32_bit(self):
        # transform("min") on a datetime64 column returns the per-group min
        # broadcast back to every row.
        df = DataFrame({"A": range(2), "B": [Timestamp("2000-01-1")] * 2})
        result = df.groupby("A")["B"].transform("min")
        expected = Series([Timestamp("2000-01-1")] * 2, name="B")
        tm.assert_series_equal(result, expected)

    def test_groupby_with_timezone_selection(self):
        # Selecting a tz-aware column before vs after aggregation gives the
        # same result.
        df = DataFrame(
            {
                "factor": np.random.default_rng(2).integers(0, 3, size=60),
                "time": date_range("01/01/2000 00:00", periods=60, freq="s", tz="UTC"),
            }
        )
        df1 = df.groupby("factor").max()["time"]
        df2 = df.groupby("factor")["time"].max()
        tm.assert_series_equal(df1, df2)

    def test_timezone_info(self):
        # tzinfo survives DataFrame construction and column assignment.
        df = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]})
        assert df["b"][0].tzinfo == pytz.utc
        df = DataFrame({"a": [1, 2, 3]})
        df["b"] = datetime.now(pytz.utc)
        assert df["b"][0].tzinfo == pytz.utc

    def test_datetime_count(self):
        # count() works on a datetime64 column.
        df = DataFrame(
            {"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="min")}
        )
        result = df.groupby("a").dates.count()
        expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates")
        tm.assert_series_equal(result, expected)

    def test_first_last_max_min_on_time_data(self):
        # max/min/first/last on datetime and timedelta columns should treat
        # NaN/NaT rows as missing, matching the NaT-free reference frame.
        df_test = DataFrame(
            {
                "dt": [
                    np.nan,
                    "2015-07-24 10:10",
                    "2015-07-25 11:11",
                    "2015-07-23 12:12",
                    np.nan,
                ],
                "td": [
                    np.nan,
                    timedelta(days=1),
                    timedelta(days=2),
                    timedelta(days=3),
                    np.nan,
                ],
            }
        )
        df_test.dt = pd.to_datetime(df_test.dt)
        df_test["group"] = "A"
        # Reference: same data with the NaT rows removed up front.
        df_ref = df_test[df_test.dt.notna()]

        grouped_test = df_test.groupby("group")
        grouped_ref = df_ref.groupby("group")

        tm.assert_frame_equal(grouped_ref.max(), grouped_test.max())
        tm.assert_frame_equal(grouped_ref.min(), grouped_test.min())
        tm.assert_frame_equal(grouped_ref.first(), grouped_test.first())
        tm.assert_frame_equal(grouped_ref.last(), grouped_test.last())

    def test_nunique_with_timegrouper_and_nat(self):
        # nunique with a time grouper ignores NaT rows, matching the result
        # on the NaT-free subset.
        test = DataFrame(
            {
                "time": [
                    Timestamp("2016-06-28 09:35:35"),
                    pd.NaT,
                    Timestamp("2016-06-28 16:46:28"),
                ],
                "data": ["1", "2", "3"],
            }
        )

        grouper = Grouper(key="time", freq="h")
        result = test.groupby(grouper)["data"].nunique()
        expected = test[test.time.notnull()].groupby(grouper)["data"].nunique()
        # result index carries no freq; normalize the reference to match.
        expected.index = expected.index._with_freq(None)
        tm.assert_series_equal(result, expected)

    def test_scalar_call_versus_list_call(self):
        # groupby(grouper) and groupby([grouper]) must agree.
        data_frame = {
            "location": ["shanghai", "beijing", "shanghai"],
            "time": Series(
                ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"],
                dtype="datetime64[ns]",
            ),
            "value": [1, 2, 3],
        }
        data_frame = DataFrame(data_frame).set_index("time")
        grouper = Grouper(freq="D")

        grouped = data_frame.groupby(grouper)
        result = grouped.count()
        grouped = data_frame.groupby([grouper])
        expected = grouped.count()

        tm.assert_frame_equal(result, expected)

    def test_grouper_period_index(self):
        # Grouping a period series by .index.month keys on the month number
        # but keeps the original index name.
        periods = 2
        index = pd.period_range(
            start="2018-01", periods=periods, freq="M", name="Month"
        )
        period_series = Series(range(periods), index=index)
        result = period_series.groupby(period_series.index.month).sum()

        expected = Series(
            range(periods), index=Index(range(1, periods + 1), name=index.name)
        )
        tm.assert_series_equal(result, expected)

    def test_groupby_apply_timegrouper_with_nat_dict_returns(
        self, groupby_with_truncated_bingrouper
    ):
        # apply returning a dict: result is a Series with a MultiIndex of
        # (bin, dict-key), covering every bin including empty ones.
        gb = groupby_with_truncated_bingrouper

        res = gb["Quantity"].apply(lambda x: {"foo": len(x)})

        df = gb.obj
        unit = df["Date"]._values.unit
        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
        mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)])
        expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity")
        tm.assert_series_equal(res, expected)

    def test_groupby_apply_timegrouper_with_nat_scalar_returns(
        self, groupby_with_truncated_bingrouper
    ):
        # apply returning a scalar: empty bins become NaN; result index is
        # the full 5-day grid without a freq.
        gb = groupby_with_truncated_bingrouper

        res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan)

        df = gb.obj
        unit = df["Date"]._values.unit
        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date", unit=unit)
        expected = Series(
            [18, np.nan, np.nan, np.nan, np.nan, np.nan, 5],
            index=dti._with_freq(None),
            name="Quantity",
        )

        tm.assert_series_equal(res, expected)

    def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
        self, frame_for_truncated_bingrouper
    ):
        df = frame_for_truncated_bingrouper

        # A freq wide enough that every non-NaT row lands in one bin.
        tdg = Grouper(key="Date", freq="100YE")
        gb = df.groupby(tdg)

        # Exactly one group; the selected object's axis has a single level.
        assert gb.ngroups == 1
        assert gb._selected_obj._get_axis(gb.axis).nlevels == 1

        # apply returning a Series per (single) group.
        msg = "DataFrameGroupBy.apply operated on the grouping columns"
        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            res = gb.apply(lambda x: x["Quantity"] * 2)

        dti = Index([Timestamp("2013-12-31")], dtype=df["Date"].dtype, name="Date")
        expected = DataFrame(
            [[36, 6, 6, 10, 2]],
            index=dti,
            columns=Index([0, 1, 5, 2, 3], name="Quantity"),
        )
        tm.assert_frame_equal(res, expected)

    @pytest.mark.single_cpu
    def test_groupby_agg_numba_timegrouper_with_nat(
        self, groupby_with_truncated_bingrouper
    ):
        pytest.importorskip("numba")

        # The numba engine should agree with the builtin "mean" aggregation
        # on a truncated BinGrouper (NaT rows present).
        gb = groupby_with_truncated_bingrouper

        result = gb["Quantity"].aggregate(
            lambda values, index: np.nanmean(values), engine="numba"
        )

        expected = gb["Quantity"].aggregate("mean")
        tm.assert_series_equal(result, expected)

        # Same check through the DataFrame (list-selection) path.
        result_df = gb[["Quantity"]].aggregate(
            lambda values, index: np.nanmean(values), engine="numba"
        )
        expected_df = gb[["Quantity"]].aggregate("mean")
        tm.assert_frame_equal(result_df, expected_df)
|
|