|
from datetime import datetime |
|
import decimal |
|
from decimal import Decimal |
|
import re |
|
|
|
import numpy as np |
|
import pytest |
|
|
|
from pandas.errors import ( |
|
PerformanceWarning, |
|
SpecificationError, |
|
) |
|
import pandas.util._test_decorators as td |
|
|
|
from pandas.core.dtypes.common import is_string_dtype |
|
|
|
import pandas as pd |
|
from pandas import ( |
|
Categorical, |
|
DataFrame, |
|
Grouper, |
|
Index, |
|
Interval, |
|
MultiIndex, |
|
RangeIndex, |
|
Series, |
|
Timedelta, |
|
Timestamp, |
|
date_range, |
|
to_datetime, |
|
) |
|
import pandas._testing as tm |
|
from pandas.core.arrays import BooleanArray |
|
import pandas.core.common as com |
|
|
|
# Module-wide mark: ignore the "Mean of empty slice" RuntimeWarning in every
# test collected from this file.
pytestmark = pytest.mark.filterwarnings(
    "ignore:Mean of empty slice:RuntimeWarning"
)
|
|
|
|
|
def test_repr():
    """Check the ``repr`` of a ``Grouper`` created with both a key and a level."""
    grouper = Grouper(key="A", level="B")
    assert repr(grouper) == (
        "Grouper(key='A', level='B', axis=0, sort=False, dropna=True)"
    )
|
|
|
|
|
def test_groupby_std_datetimelike(warn_copy_on_write):
    """Group-wise ``std`` on timedelta, naive datetime and tz-aware datetime columns.

    Columns "B" and "C" are column "A" shifted by a constant timestamp, and all
    three columns are expected to yield the same per-group standard deviation.
    The ``warn_copy_on_write`` fixture is requested but not referenced in the
    body.
    """
    deltas = Series(pd.timedelta_range("1 Day", periods=10000))
    # Double every fifth value so that group 0 differs from the other groups.
    deltas[::5] *= 2

    frame = deltas.to_frame("A").copy()
    frame["B"] = deltas + Timestamp(0)
    frame["C"] = deltas + Timestamp(0, tz="UTC")
    # The final row belongs to the last group (label 4); make it all-NaT.
    frame.iloc[-1] = pd.NaT

    labels = list(range(5)) * 2000
    result = frame.groupby(labels).std()

    std_common = Timedelta("2887 days 11:21:02.326710176")
    std_last = Timedelta("2886 days 00:42:34.664668096")
    per_group = Series(
        [std_common * 2, std_common, std_common, std_common, std_last],
        index=np.arange(5),
    )
    expected = DataFrame({column: per_group for column in ("A", "B", "C")})
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"])
def test_basic_aggregations(dtype):
    """Exercise the basic SeriesGroupBy aggregation paths for one dtype.

    The test walks through group iteration, ``aggregate``/``agg`` with NumPy
    callables, ``apply`` versus ``transform``, grouping a Series by itself,
    list and dict specifications, the deprecated use of ``x.name`` inside
    ``agg`` and the error raised when the function does not reduce.

    Parameters
    ----------
    dtype : str
        Dtype of the Series being grouped (supplied by ``parametrize``).
    """
    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)

    # Shuffle the row order with a fixed seed before grouping.
    index = np.arange(9)
    np.random.default_rng(2).shuffle(index)
    data = data.reindex(index)

    grouped = data.groupby(lambda x: x // 3, group_keys=False)

    for _, group in grouped:
        assert len(group) == 3

    msg = "using SeriesGroupBy.mean"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        agged = grouped.aggregate(np.mean)
    assert agged[1] == 1

    # ``agg`` is expected to give the same answer as ``aggregate``.
    with tm.assert_produces_warning(FutureWarning, match=msg):
        expected = grouped.agg(np.mean)
    tm.assert_series_equal(agged, expected)
    tm.assert_series_equal(agged, grouped.mean())

    result = grouped.sum()
    msg = "using SeriesGroupBy.sum"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        expected = grouped.agg(np.sum)
    tm.assert_series_equal(result, expected)

    expected = grouped.apply(lambda x: x * x.sum())
    transformed = grouped.transform(lambda x: x * x.sum())
    assert transformed[7] == 12
    tm.assert_series_equal(transformed, expected)

    # Grouping the Series by its own values.
    value_grouped = data.groupby(data)
    msg = "using SeriesGroupBy.mean"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = value_grouped.aggregate(np.mean)
    tm.assert_series_equal(result, agged, check_index_type=False)

    # List of functions.  The pattern has to be an alternation group: the
    # previous "[mean|std]" was a character class that matched any single one
    # of the characters m, e, a, n, |, s, t, d.
    msg = r"using SeriesGroupBy\.(mean|std)"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        agged = grouped.aggregate([np.mean, np.std])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped.aggregate({"one": np.mean, "two": np.std})

    group_constants = {0: 10, 1: 20, 2: 30}
    msg = (
        "Pinning the groupby key to each group in SeriesGroupBy.agg is deprecated, "
        "and cases that relied on it will raise in a future version"
    )
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # The function looks up a per-group constant through ``x.name``.
        agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
    assert agged[1] == 21

    # A function that does not reduce; only the message is checked, against
    # the broad ``Exception`` type.
    msg = "Must produce aggregated value"
    with pytest.raises(Exception, match=msg):
        grouped.aggregate(lambda x: x * 2)
|
|
|
|
|
def test_groupby_nonobject_dtype(multiindex_dataframe_random_data):
    """Grouping by the first level's codes matches grouping by their object copy.

    Only the index dtype is allowed to differ: int8 for the raw codes and int64
    for the object-dtype copy.
    """
    frame = multiindex_dataframe_random_data
    codes = frame.index.codes[0]

    result = frame.groupby(codes).sum()
    expected = frame.groupby(codes.astype("O")).sum()

    assert result.index.dtype == np.int8
    assert expected.index.dtype == np.int64
    tm.assert_frame_equal(result, expected, check_index_type=False)
|
|
|
|
|
def test_groupby_nonobject_dtype_mixed(): |
|
|
|
df = DataFrame( |
|
{ |
|
"A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"], |
|
"B": ["one", "one", "two", "three", "two", "two", "one", "three"], |
|
"C": np.random.default_rng(2).standard_normal(8), |
|
"D": np.array(np.random.default_rng(2).standard_normal(8), dtype="float32"), |
|
} |
|
) |
|
df["value"] = range(len(df)) |
|
|
|
def max_value(group): |
|
return group.loc[group["value"].idxmax()] |
|
|
|
msg = "DataFrameGroupBy.apply operated on the grouping columns" |
|
with tm.assert_produces_warning(DeprecationWarning, match=msg): |
|
applied = df.groupby("A").apply(max_value) |
|
result = applied.dtypes |
|
expected = df.dtypes |
|
tm.assert_series_equal(result, expected) |
|
|
|
|
|
def test_inconsistent_return_type(): |
|
|
|
|
|
df = DataFrame( |
|
{ |
|
"A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"], |
|
"B": Series(np.arange(7), dtype="int64"), |
|
"C": date_range("20130101", periods=7), |
|
} |
|
) |
|
|
|
def f_0(grp): |
|
return grp.iloc[0] |
|
|
|
expected = df.groupby("A").first()[["B"]] |
|
msg = "DataFrameGroupBy.apply operated on the grouping columns" |
|
with tm.assert_produces_warning(DeprecationWarning, match=msg): |
|
result = df.groupby("A").apply(f_0)[["B"]] |
|
tm.assert_frame_equal(result, expected) |
|
|
|
def f_1(grp): |
|
if grp.name == "Tiger": |
|
return None |
|
return grp.iloc[0] |
|
|
|
msg = "DataFrameGroupBy.apply operated on the grouping columns" |
|
with tm.assert_produces_warning(DeprecationWarning, match=msg): |
|
result = df.groupby("A").apply(f_1)[["B"]] |
|
e = expected.copy() |
|
e.loc["Tiger"] = np.nan |
|
tm.assert_frame_equal(result, e) |
|
|
|
def f_2(grp): |
|
if grp.name == "Pony": |
|
return None |
|
return grp.iloc[0] |
|
|
|
msg = "DataFrameGroupBy.apply operated on the grouping columns" |
|
with tm.assert_produces_warning(DeprecationWarning, match=msg): |
|
result = df.groupby("A").apply(f_2)[["B"]] |
|
e = expected.copy() |
|
e.loc["Pony"] = np.nan |
|
tm.assert_frame_equal(result, e) |
|
|
|
|
|
def f_3(grp): |
|
if grp.name == "Pony": |
|
return None |
|
return grp.iloc[0] |
|
|
|
msg = "DataFrameGroupBy.apply operated on the grouping columns" |
|
with tm.assert_produces_warning(DeprecationWarning, match=msg): |
|
result = df.groupby("A").apply(f_3)[["C"]] |
|
e = df.groupby("A").first()[["C"]] |
|
e.loc["Pony"] = pd.NaT |
|
tm.assert_frame_equal(result, e) |
|
|
|
|
|
def f_4(grp): |
|
if grp.name == "Pony": |
|
return None |
|
return grp.iloc[0].loc["C"] |
|
|
|
msg = "DataFrameGroupBy.apply operated on the grouping columns" |
|
with tm.assert_produces_warning(DeprecationWarning, match=msg): |
|
result = df.groupby("A").apply(f_4) |
|
e = df.groupby("A").first()["C"].copy() |
|
e.loc["Pony"] = np.nan |
|
e.name = None |
|
tm.assert_series_equal(result, e) |
|
|
|
|
|
def test_pass_args_kwargs(ts, tsframe):
    """Extra positional/keyword arguments reach the function in agg/apply/transform.

    The 80th percentile computed through ``np.percentile`` (directly or via a
    thin wrapper) is compared with ``quantile(0.8)`` for a Series groupby and
    for DataFrame groupbys with ``as_index`` both True and False.
    """

    def f(x, q=None, axis=0):
        return np.percentile(x, q, axis=axis)

    def pct80(x):
        return np.percentile(x, 80, axis=0)

    # Series: arguments passed positionally.
    by_month = ts.groupby(lambda x: x.month)
    agg_result = by_month.agg(np.percentile, 80, axis=0)
    apply_result = by_month.apply(np.percentile, 80, axis=0)
    trans_result = by_month.transform(np.percentile, 80, axis=0)

    agg_expected = by_month.quantile(0.8)
    trans_expected = by_month.transform(pct80)

    tm.assert_series_equal(apply_result, agg_expected)
    tm.assert_series_equal(agg_result, agg_expected)
    tm.assert_series_equal(trans_result, trans_expected)

    # Series: percentile passed by keyword to the wrapper.
    agg_result = by_month.agg(f, q=80)
    apply_result = by_month.apply(f, q=80)
    trans_result = by_month.transform(f, q=80)
    tm.assert_series_equal(agg_result, agg_expected)
    tm.assert_series_equal(apply_result, agg_expected)
    tm.assert_series_equal(trans_result, trans_expected)

    # DataFrame: a FutureWarning is expected only when as_index is False.
    excluded_msg = "A grouping .* was excluded from the result"
    for as_index in (True, False):
        frame_by_month = tsframe.groupby(lambda x: x.month, as_index=as_index)
        warn = FutureWarning if not as_index else None

        with tm.assert_produces_warning(warn, match=excluded_msg):
            agg_result = frame_by_month.agg(np.percentile, 80, axis=0)
        with tm.assert_produces_warning(warn, match=excluded_msg):
            apply_result = frame_by_month.apply(DataFrame.quantile, 0.8)
        with tm.assert_produces_warning(warn, match=excluded_msg):
            expected = frame_by_month.quantile(0.8)
        tm.assert_frame_equal(apply_result, expected, check_names=False)
        tm.assert_frame_equal(agg_result, expected)

        # A list of quantiles; this ``apply`` call is not wrapped in a
        # warning check.
        apply_result = frame_by_month.apply(DataFrame.quantile, [0.4, 0.8])
        with tm.assert_produces_warning(warn, match=excluded_msg):
            expected_seq = frame_by_month.quantile([0.4, 0.8])
        tm.assert_frame_equal(apply_result, expected_seq, check_names=False)

        with tm.assert_produces_warning(warn, match=excluded_msg):
            agg_result = frame_by_month.agg(f, q=80)
        with tm.assert_produces_warning(warn, match=excluded_msg):
            apply_result = frame_by_month.apply(DataFrame.quantile, q=0.8)
        tm.assert_frame_equal(agg_result, expected)
        tm.assert_frame_equal(apply_result, expected, check_names=False)
|
|
|
|
|
@pytest.mark.parametrize("as_index", [True, False])
def test_pass_args_kwargs_duplicate_columns(tsframe, as_index):
    """``agg`` with extra arguments works when a column label is duplicated."""
    # Relabel the fixture's columns so that "A" appears twice.
    tsframe.columns = ["A", "B", "A", "C"]
    by_month = tsframe.groupby(lambda x: x.month, as_index=as_index)

    warn = FutureWarning if not as_index else None
    msg = "A grouping .* was excluded from the result"
    with tm.assert_produces_warning(warn, match=msg):
        res = by_month.agg(np.percentile, 80, axis=0)

    # Months 1 and 2 computed by hand, one row per month after transposing.
    per_month = {
        month: tsframe[tsframe.index.month == month].quantile(0.8)
        for month in (1, 2)
    }
    expected = DataFrame(per_month).T
    if not as_index:
        # Without the month as index the rows are simply numbered 0 and 1.
        expected.index = Index(range(2))

    tm.assert_frame_equal(res, expected)
|
|
|
|
|
def test_len():
    """``len`` of a groupby equals the number of distinct key combinations."""
    frame = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD"), dtype=object),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )

    # Year/month/day: every row forms its own group.
    by_day = frame.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
    assert len(by_day) == len(frame)

    # Year/month: compare with the distinct pairs found in the index.
    by_month = frame.groupby([lambda x: x.year, lambda x: x.month])
    distinct_months = {(stamp.year, stamp.month) for stamp in frame.index}
    assert len(by_month) == len(distinct_months)
|
|
|
|
|
def test_len_nan_group():
    """Check the group counts when one key column is entirely NaN."""
    df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]})

    expected_lengths = (("a", 0), ("b", 3), (["a", "b"], 3))
    for keys, n_groups in expected_lengths:
        assert len(df.groupby(keys)) == n_groups
|
|
|
|
|
def test_basic_regression():
    """Smoke test: ``mean`` on a Series grouped by a longer float Series runs.

    Nothing is asserted; the call only has to complete without raising.
    """
    values = Series([1.0 * x for x in list(range(1, 10)) * 10])
    keys = Series(np.random.default_rng(2).random(1100) * 10.0)

    values.groupby(keys).mean()
|
|
|
|
|
@pytest.mark.parametrize(
    "dtype", ["float64", "float32", "int64", "int32", "int16", "int8"]
)
def test_with_na_groups(dtype):
    """Rows whose group label is NaN are left out of the aggregation."""
    idx = Index(np.arange(10))
    ones = Series(np.ones(10), idx, dtype=dtype)
    keys = Series(
        [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"],
        index=idx,
    )
    grouped = ones.groupby(keys)

    # ``len`` as the aggregation; the dtype of the counts is not checked.
    counts = grouped.agg(len)
    tm.assert_series_equal(
        counts, Series([4, 2], index=["bar", "foo"]), check_dtype=False
    )

    def float_len(x):
        # Hand back a float explicitly; the dtype is checked this time.
        return float(len(x))

    float_counts = grouped.agg(float_len)
    tm.assert_series_equal(float_counts, Series([4.0, 2.0], index=["bar", "foo"]))
|
|
|
|
|
def test_indices_concatenation_order():
    """``apply`` needs per-group results whose indexes can be concatenated.

    Three functions are applied; each squares the rows with odd "b" and differs
    only in what it returns for a group without such rows.
    """

    def odd_rows_squared(x):
        return x[(x.b % 2) == 1] ** 2

    def f1(x):
        # Empty groups get an empty frame with a matching two-level index.
        y = odd_rows_squared(x)
        if not y.empty:
            return y.set_index(["b", "c"])
        empty_index = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=["b", "c"])
        return DataFrame(columns=["a"], index=empty_index)

    def f2(x):
        # Empty groups get a completely empty frame (single-level index).
        y = odd_rows_squared(x)
        if not y.empty:
            return y.set_index(["b", "c"])
        return DataFrame()

    def f3(x):
        # Non-empty groups keep their original index; empty groups get a
        # two-level one.
        y = odd_rows_squared(x)
        if not y.empty:
            return y
        empty_index = MultiIndex(
            levels=[[]] * 2, codes=[[]] * 2, names=["foo", "bar"]
        )
        return DataFrame(columns=["a", "b"], index=empty_index)

    df = DataFrame({"a": [1, 2, 2, 2], "b": range(4), "c": range(5, 9)})
    df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)})

    depr_msg = "The behavior of array concatenation with empty entries is deprecated"

    # f1 succeeds and both frames give the same result.
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        result1 = df.groupby("a").apply(f1)
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        result2 = df2.groupby("a").apply(f1)
    tm.assert_frame_equal(result1, result2)

    # f2 fails for both frames: the indexes differ in number of levels.
    msg = "Cannot concat indices that do not have the same number of levels"
    for frame in (df, df2):
        with pytest.raises(AssertionError, match=msg):
            frame.groupby("a").apply(f2)

    # f3 fails with the same message; for df2 a FutureWarning is expected too.
    with pytest.raises(AssertionError, match=msg):
        df.groupby("a").apply(f3)
    with pytest.raises(AssertionError, match=msg):
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            df2.groupby("a").apply(f3)
|
|
|
|
|
def test_attr_wrapper(ts):
    """Methods and attributes looked up on a SeriesGroupBy act per group."""
    by_weekday = ts.groupby(lambda x: x.weekday())

    # A method: ``std`` equals aggregating with the sample standard deviation.
    tm.assert_series_equal(
        by_weekday.std(), by_weekday.agg(lambda x: np.std(x, ddof=1))
    )

    # ``describe`` equals describing each group and stacking the results.
    result = by_weekday.describe()
    described = {}
    for name, gp in by_weekday:
        described[name] = gp.describe()
    tm.assert_frame_equal(result, DataFrame(described).T)

    # A plain attribute.
    tm.assert_series_equal(by_weekday.dtype, by_weekday.agg(lambda x: x.dtype))

    # An unknown attribute raises.
    msg = "'SeriesGroupBy' object has no attribute 'foo'"
    with pytest.raises(AttributeError, match=msg):
        getattr(by_weekday, "foo")
|
|
|
|
|
def test_frame_groupby(tsframe):
    """Aggregate, transform, iterate and inspect a frame grouped by weekday."""
    by_weekday = tsframe.groupby(lambda x: x.weekday())

    # aggregate
    means = by_weekday.aggregate("mean")
    assert len(means) == 5
    assert len(means.columns) == 4

    # the same grouping expressed through a column
    with_col = tsframe.copy()
    with_col["weekday"] = [stamp.weekday() for stamp in with_col.index]
    by_column = with_col.groupby("weekday").aggregate("mean")
    tm.assert_frame_equal(by_column, means, check_names=False)

    # transform keeps the shape of its input
    by_weekday = tsframe.head(30).groupby(lambda x: x.weekday())
    demeaned = by_weekday.transform(lambda x: x - x.mean())
    assert len(demeaned) == 30
    assert len(demeaned.columns) == 4

    # a reducing function inside transform is repeated for each row of the group
    broadcast = by_weekday.transform(lambda x: x.mean())
    for _, group in by_weekday:
        group_mean = group.mean()
        for label in group.index:
            tm.assert_series_equal(broadcast.xs(label), group_mean, check_names=False)

    # iteration yields (key, group) pairs
    for weekday, group in by_weekday:
        assert group.index[0].weekday() == weekday

    # ``groups`` (labels) and ``indices`` (positions) describe the same rows
    groups = by_weekday.groups
    positions = by_weekday.indices
    for key, labels in groups.items():
        assert (tsframe.index.take(positions[key]) == labels).all()
|
|
|
|
|
def test_frame_groupby_columns(tsframe):
    """Group the columns of a frame through a label-to-group mapping."""
    mapping = {"A": 0, "B": 0, "C": 1, "D": 1}

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        by_columns = tsframe.groupby(mapping, axis=1)

    # aggregate: one output column per mapped group, rows untouched
    means = by_columns.aggregate("mean")
    assert len(means) == len(tsframe)
    assert len(means.columns) == 2

    # transform: same as grouping the rows of the transposed frame
    def demean(x):
        return x - x.mean()

    msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        by_rows_of_transpose = tsframe.T.groupby(mapping, axis=0)
    tm.assert_frame_equal(
        by_rows_of_transpose.transform(demean).T, by_columns.transform(demean)
    )

    # iterate: each group holds two columns
    for _, piece in by_columns:
        assert len(piece.columns) == 2
|
|
|
|
|
def test_frame_set_name_single(df):
    """The result index is named after the single grouping column "A"."""
    grouped = df.groupby("A")

    assert grouped.mean(numeric_only=True).index.name == "A"

    # With as_index=False the key is not placed in the index.
    flat = df.groupby("A", as_index=False).mean(numeric_only=True)
    assert flat.index.name != "A"

    # Frame-level aggregations.
    assert grouped[["C", "D"]].agg("mean").index.name == "A"
    assert grouped.agg({"C": "mean", "D": "std"}).index.name == "A"

    # Aggregations on a single selected column.
    column_c = grouped["C"]
    assert column_c.mean().index.name == "A"
    assert column_c.agg("mean").index.name == "A"
    assert column_c.agg(["mean", "std"]).index.name == "A"

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped["C"].agg({"foo": "mean", "bar": "std"})
|
|
|
|
|
def test_multi_func(df):
    """Group by two callables (``Series.get``) and by two string key columns."""
    key_a = df["A"].get
    key_b = df["B"].get

    agged = df.groupby([key_a, key_b]).mean(numeric_only=True)
    expected = df.groupby(["A", "B"]).mean()

    # Index names are not compared.
    value_cols = ["C", "D"]
    tm.assert_frame_equal(
        agged.loc[:, value_cols], expected.loc[:, value_cols], check_names=False
    )

    # Two key columns; the aggregation only has to run, nothing is asserted.
    keyed = DataFrame(
        {
            "v1": np.random.default_rng(2).standard_normal(6),
            "v2": np.random.default_rng(2).standard_normal(6),
            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
        },
        index=["one", "two", "three", "four", "five", "six"],
    )
    keyed.groupby(["k1", "k2"]).agg("sum")
|
|
|
|
|
def test_multi_key_multiple_functions(df):
    """A list of functions yields one column per function, named after it."""
    column_c = df.groupby(["A", "B"])["C"]

    result = column_c.agg(["mean", "std"])
    expected = DataFrame(
        {"mean": column_c.agg("mean"), "std": column_c.agg("std")}
    )
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_frame_multi_key_function_list():
    """A list of functions on a two-key groupby equals the per-column results."""
    data = DataFrame(
        {
            "A": ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3,
            "B": (["one"] * 3 + ["two"]) * 2 + ["two", "two", "one"],
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

    grouped = data.groupby(["A", "B"])
    funcs = ["mean", "std"]
    agged = grouped.agg(funcs)

    # Aggregate each value column on its own and glue the pieces side by side.
    pieces = [grouped[column].agg(funcs) for column in ("D", "E", "F")]
    expected = pd.concat(pieces, keys=["D", "E", "F"], axis=1)

    assert isinstance(agged.index, MultiIndex)
    assert isinstance(expected.index, MultiIndex)
    tm.assert_frame_equal(agged, expected)
|
|
|
|
|
def test_frame_multi_key_function_list_partial_failure():
    """A list of functions raises when one column (string "C") cannot take ``mean``."""
    data = DataFrame(
        {
            "A": ["foo"] * 4 + ["bar"] * 4 + ["foo"] * 3,
            "B": (["one"] * 3 + ["two"]) * 2 + ["two", "two", "one"],
            "C": [
                "dull", "dull", "shiny", "dull", "dull", "shiny",
                "shiny", "dull", "shiny", "shiny", "shiny",
            ],
            "D": np.random.default_rng(2).standard_normal(11),
            "E": np.random.default_rng(2).standard_normal(11),
            "F": np.random.default_rng(2).standard_normal(11),
        }
    )

    grouped = data.groupby(["A", "B"])
    expected_msg = re.escape("agg function failed [how->mean,dtype->")
    with pytest.raises(TypeError, match=expected_msg):
        grouped.agg(["mean", "std"])
|
|
|
|
|
@pytest.mark.parametrize("op", [lambda x: x.sum(), lambda x: x.mean()])
def test_groupby_multiple_columns(df, op):
    """Grouping by two columns matches nesting two single-column groupbys."""
    data = df
    grouped = data.groupby(["A", "B"])
    result1 = op(grouped)

    # Build the expectation by hand: group by "A", then by "B" inside each piece.
    pairs = [
        ((name_a, name_b), op(inner.loc[:, ["C", "D"]]))
        for name_a, outer in data.groupby("A")
        for name_b, inner in outer.groupby("B")
    ]
    keys = [key for key, _ in pairs]
    values = [value for _, value in pairs]

    expected = pd.concat(values, axis=1).T
    expected.index = MultiIndex.from_tuples(keys, names=["A", "B"])

    # Column by column: via column selection and via the full result.
    for col in ["C", "D"]:
        exp = expected[col]
        tm.assert_series_equal(op(grouped[col]), exp)
        tm.assert_series_equal(result1[col], exp)

    # A Series grouped by two other Series gives the same as the frame route.
    result = data["C"].groupby([data["A"], data["B"]]).mean()
    expected = data.groupby(["A", "B"]).mean()["C"]
    tm.assert_series_equal(result, expected)
|
|
|
|
|
def test_as_index_select_column():
    """Selecting one column from an ``as_index=False`` groupby yields Series."""
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])

    picked = df.groupby("A", as_index=False)["B"].get_group(1)
    tm.assert_series_equal(picked, Series([2, 4], name="B"))

    # With group_keys=True the result gets a two-level index.
    running = df.groupby("A", as_index=False, group_keys=True)["B"].apply(
        lambda x: x.cumsum()
    )
    expected_index = MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)])
    tm.assert_series_equal(running, Series([2, 6, 6], name="B", index=expected_index))
|
|
|
|
|
def test_obj_arg_get_group_deprecated(): |
|
depr_msg = "obj is deprecated" |
|
|
|
df = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]}) |
|
expected = df.iloc[df.groupby("b").indices.get(4)] |
|
with tm.assert_produces_warning(FutureWarning, match=depr_msg): |
|
result = df.groupby("b").get_group(4, obj=df) |
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
def test_groupby_as_index_select_column_sum_empty_df():
    """``sum`` on a column of an empty frame grouped with ``as_index=False``."""
    columns = Index(["A", "B", "C"], name="alpha")
    df = DataFrame(columns=columns)

    result = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False)

    expected = DataFrame(columns=df.columns[:2], index=range(0))
    # The expected columns carry no name, unlike the input's "alpha".
    expected.columns.names = [None]
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_as_index_agg(df):
    """``agg`` with ``as_index=False``: one key, two keys and an external key."""
    # --- one grouping key ---
    by_a = df.groupby("A", as_index=False)

    result = by_a[["C", "D"]].agg("mean")
    tm.assert_frame_equal(result, by_a.mean(numeric_only=True))

    result = by_a.agg({"C": "mean", "D": "sum"})
    expected = by_a.mean(numeric_only=True)
    expected["D"] = by_a.sum()["D"]
    tm.assert_frame_equal(result, expected)

    # With as_index=True a renaming dict on a selected column is rejected.
    indexed_by_a = df.groupby("A", as_index=True)
    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        indexed_by_a["C"].agg({"Q": "sum"})

    # --- two grouping keys ---
    by_ab = df.groupby(["A", "B"], as_index=False)

    tm.assert_frame_equal(by_ab.agg("mean"), by_ab.mean())

    result = by_ab.agg({"C": "mean", "D": "sum"})
    expected = by_ab.mean()
    expected["D"] = by_ab.sum()["D"]
    tm.assert_frame_equal(result, expected)

    # With as_index=False the renaming dict only warns.
    expected = DataFrame(by_ab["C"].sum()).rename(columns={"C": "Q"})
    msg = "Passing a dictionary to SeriesGroupBy.agg is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = by_ab["C"].agg({"Q": "sum"})
    tm.assert_frame_equal(result, expected)

    # --- grouping by a Series that shares its name with a column ---
    frame = DataFrame(
        np.random.default_rng(2).integers(0, 100, (50, 3)),
        columns=["jim", "joe", "jolie"],
    )
    key = Series(np.random.default_rng(2).integers(5, 10, 50), name="jim")

    gr = frame.groupby(key)
    # The return value is not inspected; the call is made before ``apply``.
    gr.nth(0)

    msg = "The behavior of DataFrame.sum with axis=None is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
        res = gr.apply(sum)
    with tm.assert_produces_warning(FutureWarning, match=msg, check_stacklevel=False):
        alt = frame.groupby(key).apply(sum)
    tm.assert_frame_equal(res, alt)

    # as_index=False with the Series key versus as_index=True with its values.
    for attr in ("mean", "max", "count", "idxmax", "cumsum", "all"):
        left = getattr(frame.groupby(key, as_index=False), attr)()

        by_values = frame.groupby(key.values, as_index=True)
        right = getattr(by_values, attr)().reset_index(drop=True)

        tm.assert_frame_equal(left, right)
|
|
|
|
|
def test_ops_not_as_index(reduction_func):
    """Reductions with ``as_index=False`` equal the indexed result reset to columns."""
    if reduction_func in ("corrwith", "nth", "ngroup"):
        pytest.skip(f"GH 5755: Test not applicable for {reduction_func}")

    df = DataFrame(
        np.random.default_rng(2).integers(0, 5, size=(100, 2)), columns=["a", "b"]
    )

    expected = getattr(df.groupby("a"), reduction_func)()
    if reduction_func == "size":
        expected = expected.rename("size").reset_index()
    else:
        expected = expected.reset_index()
        # Make the key column's dtype agree with the original frame's.
        expected["a"] = expected["a"].astype(df["a"].dtype)

    g = df.groupby("a", as_index=False)

    # Method call on the frame groupby.
    result = getattr(g, reduction_func)()
    tm.assert_frame_equal(result, expected)

    # Same reduction by name through ``agg``.
    result = g.agg(reduction_func)
    tm.assert_frame_equal(result, expected)

    # Method call after selecting column "b".
    result = getattr(g["b"], reduction_func)()
    tm.assert_frame_equal(result, expected)

    # ``agg`` after selecting column "b".
    result = g["b"].agg(reduction_func)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_as_index_series_return_frame(df):
    """Selecting one column with ``as_index=False`` still returns a DataFrame."""
    cases = [
        (df.groupby("A", as_index=False), ["A", "C"]),
        (df.groupby(["A", "B"], as_index=False), ["A", "B", "C"]),
    ]

    # Through ``agg("sum")``.
    for grouped, columns in cases:
        result = grouped["C"].agg("sum")
        expected = grouped.agg("sum").loc[:, columns]
        assert isinstance(result, DataFrame)
        tm.assert_frame_equal(result, expected)

    # Through the ``sum`` method.
    for grouped, columns in cases:
        result = grouped["C"].sum()
        expected = grouped.sum().loc[:, columns]
        assert isinstance(result, DataFrame)
        tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_as_index_series_column_slice_raises(df):
    """Selecting a second column after one is already selected raises IndexError."""
    selected_c = df.groupby("A", as_index=False)["C"]

    with pytest.raises(IndexError, match=r"Column\(s\) C already selected"):
        selected_c.__getitem__("D")
|
|
|
|
|
def test_groupby_as_index_cython(df):
    """``mean`` with ``as_index=False`` puts the keys in leading columns."""
    # One key: move the index of the indexed result into column "A".
    result = df.groupby("A", as_index=False).mean(numeric_only=True)
    expected = df.groupby(["A"]).mean(numeric_only=True)
    expected.insert(0, "A", expected.index)
    expected.index = RangeIndex(len(expected))
    tm.assert_frame_equal(result, expected)

    # Two keys: unzip the index tuples into columns "A" and "B".
    result = df.groupby(["A", "B"], as_index=False).mean()
    expected = df.groupby(["A", "B"]).mean()

    key_columns = list(zip(*expected.index.values))
    for position, label in enumerate(["A", "B"]):
        expected.insert(position, label, key_columns[position])
    expected.index = RangeIndex(len(expected))
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_as_index_series_scalar(df):
    """``agg(len)`` on one selected column matches the frame-level result."""
    by_ab = df.groupby(["A", "B"], as_index=False)

    result = by_ab["C"].agg(len)
    expected = by_ab.agg(len).loc[:, ["A", "B", "C"]]
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_as_index_corner(df, ts):
    """``as_index=False`` is rejected for a Series and for ``axis=1``."""
    with pytest.raises(TypeError, match="as_index=False only valid with DataFrame"):
        ts.groupby(lambda x: x.weekday(), as_index=False)

    # The axis=1 deprecation warning is expected as well as the error.
    depr_msg = "DataFrame.groupby with axis=1 is deprecated"
    with pytest.raises(ValueError, match="as_index=False only valid for axis=0"):
        with tm.assert_produces_warning(FutureWarning, match=depr_msg):
            df.groupby(lambda x: x.lower(), as_index=False, axis=1)
|
|
|
|
|
def test_groupby_multiple_key(): |
|
df = DataFrame( |
|
np.random.default_rng(2).standard_normal((10, 4)), |
|
columns=Index(list("ABCD"), dtype=object), |
|
index=date_range("2000-01-01", periods=10, freq="B"), |
|
) |
|
grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]) |
|
agged = grouped.sum() |
|
tm.assert_almost_equal(df.values, agged.values) |
|
|
|
depr_msg = "DataFrame.groupby with axis=1 is deprecated" |
|
with tm.assert_produces_warning(FutureWarning, match=depr_msg): |
|
grouped = df.T.groupby( |
|
[lambda x: x.year, lambda x: x.month, lambda x: x.day], axis=1 |
|
) |
|
|
|
agged = grouped.agg(lambda x: x.sum()) |
|
tm.assert_index_equal(agged.index, df.columns) |
|
tm.assert_almost_equal(df.T.values, agged.values) |
|
|
|
agged = grouped.agg(lambda x: x.sum()) |
|
tm.assert_almost_equal(df.T.values, agged.values) |
|
|
|
|
|
def test_groupby_multi_corner(df):
    """An all-NaN column stays all-NaN after a two-key ``mean``."""
    with_nan = df.copy()
    with_nan["bad"] = np.nan

    result = with_nan.groupby(["A", "B"]).mean()

    expected = with_nan.groupby(["A", "B"]).mean()
    expected["bad"] = np.nan

    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_raises_on_nuisance(df):
    """Reductions raise TypeError when a column does not support them."""
    # ``mean`` on the fixture as given.
    by_a = df.groupby("A")
    mean_msg = re.escape("agg function failed [how->mean,dtype->")
    with pytest.raises(TypeError, match=mean_msg):
        by_a.agg("mean")
    with pytest.raises(TypeError, match=mean_msg):
        by_a.mean()

    # ``sum`` after adding a datetime column.
    df = df.loc[:, ["A", "C", "D"]]
    df["E"] = datetime.now()
    by_a = df.groupby("A")
    sum_msg = "datetime64 type does not support sum operations"
    with pytest.raises(TypeError, match=sum_msg):
        by_a.agg("sum")
    with pytest.raises(TypeError, match=sum_msg):
        by_a.sum()

    # Grouping the columns instead (deprecated axis=1 path) fails as well.
    depr_msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=depr_msg):
        by_columns = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1)
    with pytest.raises(TypeError, match="does not support reduction 'sum'"):
        by_columns.agg(lambda x: x.sum(0, numeric_only=False))
|
|
|
|
|
@pytest.mark.parametrize(
    "agg_function",
    ["max", "min"],
)
def test_keep_nuisance_agg(df, agg_function):
    """``max``/``min`` keep column "B" and agree with a per-group reduction."""
    result = getattr(df.groupby("A"), agg_function)()

    # Recompute column "B" for each key directly from the matching rows.
    expected = result.copy()
    for key in ("bar", "foo"):
        column_b = df.loc[df["A"] == key, "B"]
        expected.loc[key, "B"] = getattr(column_b, agg_function)()

    tm.assert_frame_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize(
    "agg_function",
    ["sum", "mean", "prod", "std", "var", "sem", "median"],
)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_omit_nuisance_agg(df, agg_function, numeric_only):
    """Reductions either raise with ``numeric_only=False`` or match a column subset."""
    grouped = df.groupby("A")

    no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
    if agg_function in no_drop_nuisance and not numeric_only:
        # These reductions are expected to raise; ``std`` and ``sem`` raise a
        # different exception type and message from the rest.
        if agg_function in ("std", "sem"):
            klass, msg = ValueError, "could not convert string to float: 'one'"
        else:
            klass = TypeError
            msg = re.escape(f"agg function failed [how->{agg_function},dtype->")
        with pytest.raises(klass, match=msg):
            getattr(grouped, agg_function)(numeric_only=numeric_only)
        return

    result = getattr(grouped, agg_function)(numeric_only=numeric_only)

    # Only ``sum`` with numeric_only=False keeps column "B" in the comparison.
    keeps_b = not numeric_only and agg_function == "sum"
    columns = ["A", "B", "C", "D"] if keeps_b else ["A", "C", "D"]

    subset_grouped = df.loc[:, columns].groupby("A")
    expected = getattr(subset_grouped, agg_function)(numeric_only=numeric_only)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_raise_on_nuisance_python_single(df):
    """``skew`` on the frame grouped by "A" raises ValueError."""
    by_a = df.groupby("A")
    with pytest.raises(ValueError, match="could not convert"):
        by_a.skew()
|
|
|
|
|
def test_raise_on_nuisance_python_multiple(three_group):
    """``mean`` on a two-key groupby raises TypeError, by name and by method."""
    by_ab = three_group.groupby(["A", "B"])
    expected_msg = re.escape("agg function failed [how->mean,dtype->")

    with pytest.raises(TypeError, match=expected_msg):
        by_ab.agg("mean")
    with pytest.raises(TypeError, match=expected_msg):
        by_ab.mean()
|
|
|
|
|
def test_empty_groups_corner(multiindex_dataframe_random_data):
    """
    Compare ``agg("mean")`` with ``mean`` on a two-key groupby, then check a
    level-based groupby on a two-row slice of the fixture frame.
    """
    frame = DataFrame(
        {
            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
            "k3": ["foo", "bar"] * 3,
            "v1": np.random.default_rng(2).standard_normal(6),
            "v2": np.random.default_rng(2).standard_normal(6),
        }
    )

    by_keys = frame.groupby(["k1", "k2"])
    selected_means = by_keys[["v1", "v2"]].agg("mean")
    numeric_means = by_keys.mean(numeric_only=True)
    tm.assert_frame_equal(selected_means, numeric_means)

    by_level = multiindex_dataframe_random_data[3:5].groupby(level=0)
    frame_means = by_level.apply(lambda x: x.mean())
    column_means = by_level["A"].apply("mean")
    tm.assert_series_equal(frame_means["A"], column_means)
    assert frame_means.index.name == "first"
|
|
|
|
|
def test_nonsense_func():
    """A key function that fails on the index labels must surface its TypeError."""
    frame = DataFrame([0])
    pattern = r"unsupported operand type\(s\) for \+: 'int' and 'str'"

    def append_text(label):
        # Integer label + str is the invalid operation under test.
        return label + "foo"

    with pytest.raises(TypeError, match=pattern):
        frame.groupby(append_text)
|
|
|
|
|
def test_wrap_aggregated_output_multindex(multiindex_dataframe_random_data):
    """
    Aggregating a transposed MultiIndex frame keeps MultiIndex columns, and
    errors (from ``mean`` on a string column or from a user function) propagate.
    """
    frame = multiindex_dataframe_random_data.T
    frame["baz", "two"] = "peekaboo"

    group_keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
    pattern = re.escape("agg function failed [how->mean,dtype->")
    with pytest.raises(TypeError, match=pattern):
        frame.groupby(group_keys).agg("mean")

    numeric_only_frame = frame.drop(columns=("baz", "two"))
    means = numeric_only_frame.groupby(group_keys).agg("mean")
    assert isinstance(means.columns, MultiIndex)

    def aggfun(ser):
        # Fail on purpose for exactly one column.
        if ser.name == ("foo", "one"):
            raise TypeError("Test error message")
        return ser.sum()

    with pytest.raises(TypeError, match="Test error message"):
        frame.groupby(group_keys).aggregate(aggfun)
|
|
|
|
|
def test_groupby_level_apply(multiindex_dataframe_random_data):
    """Grouping by a level number names the result index after that level."""
    frame = multiindex_dataframe_random_data

    for level, level_name in enumerate(("first", "second")):
        counts = frame.groupby(level=level).count()
        assert counts.index.name == level_name

    column_counts = frame["A"].groupby(level=0).count()
    assert column_counts.index.name == "first"
|
|
|
|
|
def test_groupby_level_mapper(multiindex_dataframe_random_data):
    """
    Grouping with a dict mapper plus ``level`` must match grouping by the
    array obtained from mapping that level's labels by hand.
    """
    frame = multiindex_dataframe_random_data
    flat = frame.reset_index()

    cases = (
        (0, "first", {"foo": 0, "bar": 0, "baz": 1, "qux": 1}),
        (1, "second", {"one": 0, "two": 0, "three": 1}),
    )
    for level, level_name, mapper in cases:
        result = frame.groupby(mapper, level=level).sum()

        mapped = np.array(
            [mapper.get(label) for label in flat[level_name]], dtype=np.int64
        )
        expected = frame.groupby(mapped).sum()
        expected.index.name = level_name

        tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_level_nonmulti():
    """
    On a flat index, levels 0 / -1 (scalar or single-item list) are accepted;
    every other ``level`` spelling raises ``ValueError``.
    """
    ser = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo"))
    expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo"))

    for good_level in (0, [0], -1, [-1]):
        tm.assert_series_equal(ser.groupby(level=good_level).sum(), expected)

    out_of_range = "level > 0 or level < -1 only valid with MultiIndex"
    too_many = "multiple levels only valid with MultiIndex"
    rejected = (
        (1, out_of_range),
        (-2, out_of_range),
        ([], "No group keys passed!"),
        ([0, 0], too_many),
        ([0, 1], too_many),
        ([1], out_of_range),
    )
    for bad_level, pattern in rejected:
        with pytest.raises(ValueError, match=pattern):
            ser.groupby(level=bad_level)
|
|
|
|
|
def test_groupby_complex():
    """Summing complex values per index label keeps the complex totals."""
    values = np.arange(4) * (1 + 2j)
    ser = Series(data=values, index=[0, 0, 1, 1])
    expected_totals = Series((1 + 2j, 5 + 10j))

    totals = ser.groupby(level=0).sum()
    tm.assert_series_equal(totals, expected_totals)
|
|
|
|
|
def test_groupby_complex_mean():
    """Grouping by a complex-valued column and taking the mean of "a"."""
    records = [
        {"a": 2, "b": 1 + 2j},
        {"a": 1, "b": 1 + 1j},
        {"a": 1, "b": 1 + 2j},
    ]
    means = DataFrame(records).groupby("b").mean()

    expected_index = Index([(1 + 1j), (1 + 2j)], name="b")
    expected = DataFrame(
        [[1.0], [1.5]],
        index=expected_index,
        columns=Index(["a"]),
    )
    tm.assert_frame_equal(means, expected)
|
|
|
|
|
def test_groupby_complex_numbers(using_infer_string):
    """Counting per complex key, for both ``sort=False`` and ``sort=True``."""
    frame = DataFrame(
        [
            {"a": 1, "b": 1 + 1j},
            {"a": 1, "b": 1 + 2j},
            {"a": 4, "b": 1},
        ]
    )
    column_dtype = "string[pyarrow_numpy]" if using_infer_string else object

    def expected_counts(key_order):
        # One row per key, each with a count of 1, in the given key order.
        return DataFrame(
            np.array([1, 1, 1], dtype=np.int64),
            index=Index(key_order, name="b"),
            columns=Index(["a"], dtype=column_dtype),
        )

    # sort=False keeps the order in which the keys first appear.
    unsorted = frame.groupby("b", sort=False).count()
    tm.assert_frame_equal(
        unsorted, expected_counts([(1 + 1j), (1 + 2j), (1 + 0j)])
    )

    in_order = frame.groupby("b", sort=True).count()
    tm.assert_frame_equal(
        in_order, expected_counts([(1 + 0j), (1 + 1j), (1 + 2j)])
    )
|
|
|
|
|
def test_groupby_series_indexed_differently():
    """
    Grouping by a Series with a different index must match grouping by a
    lookup on that Series reindexed to the grouped object's index.
    """
    values = Series(
        [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7],
        index=Index(["a", "b", "c", "d", "e", "f", "g"]),
    )
    keys = Series(
        [1.0, 1.0, 4.0, 5.0, 5.0, 7.0], index=Index(["a", "b", "d", "f", "g", "h"])
    )

    by_series = values.groupby(keys).mean()

    lookup = keys.reindex(values.index).get
    by_lookup = values.groupby(lookup).mean()

    tm.assert_series_equal(by_series, by_lookup)
|
|
|
|
|
def test_groupby_with_hier_columns(): |
|
tuples = list( |
|
zip( |
|
*[ |
|
["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"], |
|
["one", "two", "one", "two", "one", "two", "one", "two"], |
|
] |
|
) |
|
) |
|
index = MultiIndex.from_tuples(tuples) |
|
columns = MultiIndex.from_tuples( |
|
[("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")] |
|
) |
|
df = DataFrame( |
|
np.random.default_rng(2).standard_normal((8, 4)), index=index, columns=columns |
|
) |
|
|
|
result = df.groupby(level=0).mean() |
|
tm.assert_index_equal(result.columns, columns) |
|
|
|
depr_msg = "DataFrame.groupby with axis=1 is deprecated" |
|
with tm.assert_produces_warning(FutureWarning, match=depr_msg): |
|
gb = df.groupby(level=0, axis=1) |
|
result = gb.mean() |
|
tm.assert_index_equal(result.index, df.index) |
|
|
|
result = df.groupby(level=0).agg("mean") |
|
tm.assert_index_equal(result.columns, columns) |
|
|
|
result = df.groupby(level=0).apply(lambda x: x.mean()) |
|
tm.assert_index_equal(result.columns, columns) |
|
|
|
with tm.assert_produces_warning(FutureWarning, match=depr_msg): |
|
gb = df.groupby(level=0, axis=1) |
|
result = gb.agg(lambda x: x.mean(1)) |
|
tm.assert_index_equal(result.columns, Index(["A", "B"])) |
|
tm.assert_index_equal(result.index, df.index) |
|
|
|
|
|
sorted_columns, _ = columns.sortlevel(0) |
|
df["A", "foo"] = "bar" |
|
result = df.groupby(level=0).mean(numeric_only=True) |
|
tm.assert_index_equal(result.columns, df.columns[:-1]) |
|
|
|
|
|
def test_grouping_ndarray(df):
    """Grouping by a raw ndarray equals grouping by the same values as an unnamed Series."""
    by_array = df.groupby(df["A"].values).sum()
    by_unnamed_series = df.groupby(df["A"].rename(None)).sum()
    tm.assert_frame_equal(by_array, by_unnamed_series)
|
|
|
|
|
def test_groupby_wrong_multi_labels():
    """``agg("mean")`` and ``mean()`` agree on a four-key groupby."""
    frame = DataFrame(
        {
            "foo": ["foo1", "foo1", "foo2", "foo1", "foo3"],
            "bar": ["bar1", "bar2", "bar2", "bar1", "bar1"],
            "baz": ["baz1", "baz1", "baz1", "baz2", "baz2"],
            "spam": ["spam2", "spam3", "spam2", "spam1", "spam1"],
            "data": [20, 30, 40, 50, 60],
        },
        index=Index([0, 1, 2, 3, 4], name="index"),
    )

    by_all_keys = frame.groupby(["foo", "bar", "baz", "spam"])

    via_agg = by_all_keys.agg("mean")
    via_method = by_all_keys.mean()
    tm.assert_frame_equal(via_agg, via_method)
|
|
|
|
|
def test_groupby_series_with_name(df):
    """Series used as keys lend their names to the result index / columns."""
    indexed = df.groupby(df["A"]).mean(numeric_only=True)
    flat = df.groupby(df["A"], as_index=False).mean(numeric_only=True)
    assert indexed.index.name == "A"
    assert "A" in flat

    indexed = df.groupby([df["A"], df["B"]]).mean()
    flat = df.groupby([df["A"], df["B"]], as_index=False).mean()
    assert indexed.index.names == ("A", "B")
    for key_name in ("A", "B"):
        assert key_name in flat
|
|
|
|
|
def test_seriesgroupby_name_attr(df):
    """Results of a column-selected groupby keep the selected column's name."""
    column_groups = df.groupby("A")["C"]
    assert column_groups.count().name == "C"
    assert column_groups.mean().name == "C"

    def doubled_sum(x):
        return np.sum(x) * 2

    assert column_groups.agg(doubled_sum).name == "C"
|
|
|
|
|
def test_consistency_name():
    """``df.groupby(["A"]).B.count()`` equals ``df.B.groupby(df.A).count()``."""
    frame = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.default_rng(2).standard_normal(8) + 1.0,
            "D": np.arange(8),
        }
    )

    frame_first = frame.groupby(["A"]).B.count()
    column_first = frame.B.groupby(frame.A).count()
    tm.assert_series_equal(column_first, frame_first)
|
|
|
|
|
def test_groupby_name_propagation(df):
    """
    The ``name`` of Series returned from ``apply`` becomes the result's
    ``columns.name`` only when it is the same for every group.
    """

    def summarize(df, name=None):
        return Series({"count": 1, "mean": 2, "omissions": 3}, name=name)

    def summarize_random_name(df):
        # The name is taken from the group's first "A" value, so it differs
        # between groups.
        return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"])

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    scenarios = (
        (summarize, (), None),
        (summarize, ("metrics",), "metrics"),
        (summarize_random_name, (), None),
    )
    for func, extra_args, expected_name in scenarios:
        with tm.assert_produces_warning(DeprecationWarning, match=msg):
            metrics = df.groupby("A").apply(func, *extra_args)
        if expected_name is None:
            assert metrics.columns.name is None
        else:
            assert metrics.columns.name == expected_name
|
|
|
|
|
def test_groupby_nonstring_columns():
    """Grouping by the integer column label 0 equals grouping by that column."""
    rows = [np.arange(10) for _ in range(10)]
    frame = DataFrame(rows)

    by_label = frame.groupby(0).mean()
    by_column = frame.groupby(frame[0]).mean()
    tm.assert_frame_equal(by_label, by_column)
|
|
|
|
|
def test_groupby_mixed_type_columns():
    """``first`` and ``sum`` work when column labels mix strings and ints."""
    frame = DataFrame([[0, 1, 2]], columns=["A", "B", 0])
    expected = DataFrame([[1, 2]], columns=["B", 0], index=Index([0], name="A"))

    # Evaluated one at a time so each groupby is built fresh.
    for reduction in ("first", "sum"):
        outcome = getattr(frame.groupby("A"), reduction)()
        tm.assert_frame_equal(outcome, expected)
|
|
|
|
|
def test_cython_grouper_series_bug_noncontig():
    """Median aggregation over a Series built from a column slice of an all-NaN 2D array."""
    nan_block = np.full((100, 100), np.nan)
    column = Series(nan_block[:, 0])
    labels = np.tile(range(10), 10)

    medians = column.groupby(labels).agg(Series.median)
    assert medians.isna().all()
|
|
|
|
|
def test_series_grouper_noncontig_index():
    """
    Smoke test: aggregate with a function that touches every index element of
    a Series whose index is a strided slice of a larger Index.
    """
    full_index = Index(["a" * 10] * 100)
    values = Series(
        np.random.default_rng(2).standard_normal(50), index=full_index[::2]
    )
    labels = np.random.default_rng(2).integers(0, 5, 50)

    by_label = values.groupby(labels)

    def count_index_ids(x):
        # Walks the index elements; only required to complete without error.
        return len(set(map(id, x.index)))

    by_label.agg(count_index_ids)
|
|
|
|
|
def test_convert_objects_leave_decimal_alone():
    """Aggregations returning ``Decimal`` yield an object-dtype result of Decimals."""
    ser = Series(range(5))
    labels = np.array(["a", "b", "c", "d", "e"], dtype="O")

    def convert_fast(x):
        return Decimal(str(x.mean()))

    def convert_force_pure(x):
        # Requires the group's values to be backed by a non-empty base array.
        assert len(x.values.base) > 0
        return Decimal(str(x.mean()))

    by_label = ser.groupby(labels)

    for converter in (convert_fast, convert_force_pure):
        converted = by_label.agg(converter)
        assert converted.dtype == np.object_
        assert isinstance(converted.iloc[0], Decimal)
|
|
|
|
|
def test_groupby_dtype_inference_empty():
    """``first`` on an empty frame keeps the int64 dtype of the value column."""
    frame = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
    assert frame["x"].dtype == np.float64

    firsts = frame.groupby("x").first()

    empty_keys = Index([], name="x", dtype=np.float64)
    empty_values = Series([], index=empty_keys, dtype="int64")
    expected = DataFrame({"range": empty_values})
    tm.assert_frame_equal(firsts, expected, by_blocks=True)
|
|
|
|
|
def test_groupby_unit64_float_conversion():
    """``max`` must return a very large integer value unchanged."""
    big_value = 16148277970000000000
    frame = DataFrame({"first": [1], "second": [1], "value": [big_value]})

    maxima = frame.groupby(["first", "second"])["value"].max()

    expected_index = MultiIndex.from_product([[1], [1]], names=["first", "second"])
    expected = Series([big_value], expected_index, name="value")
    tm.assert_series_equal(maxima, expected)
|
|
|
|
|
def test_groupby_list_infer_array_like(df):
    """
    A list key is interpreted as array-like group labels when its length
    matches the frame, and as a list of column names otherwise.
    """
    # Same length as the frame: the list of values acts like the Series.
    result = df.groupby(list(df["A"])).mean(numeric_only=True)
    expected = df.groupby(df["A"]).mean(numeric_only=True)
    tm.assert_frame_equal(result, expected, check_names=False)

    # One element short: the entries are looked up as column names instead.
    with pytest.raises(KeyError, match=r"^'foo'$"):
        df.groupby(list(df["A"][:-1]))

    # Two keys on a two-row frame: the list has the same length as the frame.
    df = DataFrame(
        {
            "foo": [0, 1],
            "bar": [3, 4],
            "val": np.random.default_rng(2).standard_normal(2),
        }
    )

    result = df.groupby(["foo", "bar"]).mean()
    expected = df.groupby([df["foo"], df["bar"]]).mean()[["val"]]
    # The original computed both frames but never compared them.
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_keys_same_size_as_index():
    """Group by a ``Grouper`` on the index plus a column when there are as many keys as rows."""
    freq = "s"
    stamps = date_range(
        start=Timestamp("2015-09-29T11:34:44-0700"), periods=2, freq=freq
    )
    frame = DataFrame(
        [["A", 10], ["B", 15]], columns=["metric", "values"], index=stamps
    )

    means = frame.groupby([Grouper(level=0, freq=freq), "metric"]).mean()

    # Every row is its own group, so the means are the values cast to float.
    expected = frame.set_index([frame.index, "metric"]).astype(float)
    tm.assert_frame_equal(means, expected)
|
|
|
|
|
def test_groupby_one_row():
    """Grouping by a missing column raises ``KeyError`` for one-row and two-row frames."""
    pattern = r"^'Z'$"

    for n_rows in (1, 2):
        frame = DataFrame(
            np.random.default_rng(2).standard_normal((n_rows, 4)),
            columns=list("ABCD"),
        )
        with pytest.raises(KeyError, match=pattern):
            frame.groupby("Z")
|
|
|
|
|
def test_groupby_nat_exclude():
    """
    Missing datetime keys form no group: ``groups``, ``indices``, ``ngroups``
    and ``get_group`` only see the two real timestamps, and all-missing key
    columns produce no groups at all.
    """
    jan = Timestamp("2013-01-01")
    feb = Timestamp("2013-02-01")
    frame = DataFrame(
        {
            "values": np.random.default_rng(2).standard_normal(8),
            "dt": [np.nan, jan, np.nan, feb, np.nan, feb, np.nan, jan],
            "str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"],
        }
    )
    by_dt = frame.groupby("dt")

    group_keys = sorted(by_dt.groups.keys())
    assert len(group_keys) == 2
    for key, labels in zip(group_keys, [Index([1, 7]), Index([3, 5])]):
        tm.assert_index_equal(by_dt.groups[key], labels)

    # The object held by the grouping is still the full, unfiltered frame.
    tm.assert_frame_equal(by_dt._grouper.groupings[0].obj, frame)
    assert by_dt.ngroups == 2

    positions = {
        Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.intp),
        Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.intp),
    }
    for key in by_dt.indices:
        tm.assert_numpy_array_equal(by_dt.indices[key], positions[key])

    for stamp, rows in ((jan, [1, 7]), (feb, [3, 5])):
        tm.assert_frame_equal(by_dt.get_group(stamp), frame.iloc[rows])

    with pytest.raises(KeyError, match=r"^NaT$"):
        by_dt.get_group(pd.NaT)

    all_missing = DataFrame(
        {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]}
    )
    assert all_missing["nan"].dtype == "float64"
    assert all_missing["nat"].dtype == "datetime64[ns]"

    for column in ["nan", "nat"]:
        empty_groups = all_missing.groupby(column)
        assert empty_groups.groups == {}
        assert empty_groups.ngroups == 0
        assert empty_groups.indices == {}
        with pytest.raises(KeyError, match=r"^nan$"):
            empty_groups.get_group(np.nan)
        with pytest.raises(KeyError, match=r"^NaT$"):
            empty_groups.get_group(pd.NaT)
|
|
|
|
|
def test_groupby_two_group_keys_all_nan():
    """Two key columns that are entirely NaN give an empty ``indices`` mapping."""
    frame = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]})
    positions = frame.groupby(["a", "b"]).indices
    assert positions == {}
|
|
|
|
|
def test_groupby_2d_malformed():
    """Numeric-only mean on a frame assembled column by column with mixed dtypes."""
    frame = DataFrame(index=range(2))
    # Columns are added one at a time, exactly as in the scenario under test.
    frame["group"] = ["g1", "g2"]
    frame["zeros"] = [0, 0]
    frame["ones"] = [1, 1]
    frame["label"] = ["l1", "l2"]

    means = frame.groupby(["group"]).mean(numeric_only=True)

    tm.assert_index_equal(means.columns, Index(["zeros", "ones"]))
    tm.assert_numpy_array_equal(means.values, np.array([[0.0, 1.0], [0.0, 1.0]]))
|
|
|
|
|
def test_int32_overflow():
    """Four-key groupby on 25000 rows gives the same group count in either key order."""
    a_values = np.arange(25000)
    b_values = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000)))
    frame = DataFrame(
        {
            "A": a_values,
            "B": b_values,
            "C": a_values,
            "D": b_values,
            "E": np.random.default_rng(2).standard_normal(25000),
        }
    )

    forward = frame.groupby(["A", "B", "C", "D"]).sum()
    backward = frame.groupby(["D", "C", "B", "A"]).sum()
    assert len(forward) == len(backward)
|
|
|
|
|
def test_groupby_sort_multi():
    """
    Sorted multi-key groupby orders the result index by the key tuples, and
    its sums agree with grouping directly by an array of tuples.
    """

    def key_tuples(frame, key_columns):
        # Row-wise tuples of the key columns as a 1-D object array.
        rows = [tuple(row) for row in frame[key_columns].values]
        return com.asarray_tuplesafe(rows)

    frame = DataFrame(
        {
            "a": ["foo", "bar", "baz"],
            "b": [3, 2, 1],
            "c": [0, 1, 2],
            "d": np.random.default_rng(2).standard_normal(3),
        }
    )

    # (key columns, positions of the rows in sorted order; None = already sorted)
    orderings = (
        (["a", "b", "c"], [1, 2, 0]),
        (["c", "a", "b"], None),
        (["b", "c", "a"], [2, 1, 0]),
    )
    for key_columns, sorted_positions in orderings:
        tups = key_tuples(frame, key_columns)
        result = frame.groupby(key_columns, sort=True).sum()
        expected = tups if sorted_positions is None else tups[sorted_positions]
        tm.assert_numpy_array_equal(result.index.values, expected)

    pairs = DataFrame(
        {
            "a": [0, 1, 2, 0, 1, 2],
            "b": [0, 0, 0, 1, 1, 1],
            "d": np.random.default_rng(2).standard_normal(6),
        }
    )
    sums = pairs.groupby(["a", "b"])["d"].sum()

    # Grouping by the tuple array must give the same per-key sums.
    by_tuples = pairs.groupby(key_tuples(pairs, ["a", "b"]))["d"].sum()
    for key, total in by_tuples.items():
        assert sums[key] == total
|
|
|
|
|
def test_dont_clobber_name_column(): |
|
df = DataFrame( |
|
{"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2} |
|
) |
|
|
|
msg = "DataFrameGroupBy.apply operated on the grouping columns" |
|
with tm.assert_produces_warning(DeprecationWarning, match=msg): |
|
result = df.groupby("key", group_keys=False).apply(lambda x: x) |
|
tm.assert_frame_equal(result, df) |
|
|
|
|
|
def test_skip_group_keys():
    """
    With ``group_keys=False`` the result of ``apply`` equals the plain
    concatenation of the per-group outputs, for a frame and for a Series.
    """
    frame = DataFrame(
        np.random.default_rng(2).standard_normal((10, 4)),
        columns=Index(list("ABCD"), dtype=object),
        index=date_range("2000-01-01", periods=10, freq="B"),
    )

    def month_of(stamp):
        return stamp.month

    def smallest_three_rows(group):
        return group.sort_values(by="A")[:3]

    frame_groups = frame.groupby(month_of, group_keys=False)
    applied = frame_groups.apply(smallest_three_rows)
    stacked = pd.concat([smallest_three_rows(group) for _, group in frame_groups])
    tm.assert_frame_equal(applied, stacked)

    def smallest_three_values(group):
        return group.sort_values()[:3]

    series_groups = frame["A"].groupby(month_of, group_keys=False)
    applied = series_groups.apply(smallest_three_values)
    stacked = pd.concat([smallest_three_values(group) for _, group in series_groups])
    tm.assert_series_equal(applied, stacked)
|
|
|
|
|
def test_no_nonsense_name(float_frame):
    """Aggregating an unnamed Series must leave the result unnamed."""
    unnamed = float_frame["C"].copy()
    unnamed.name = None

    totals = unnamed.groupby(float_frame["A"]).agg("sum")
    assert totals.name is None
|
|
|
|
|
def test_multifunc_sum_bug():
    """A dict ``agg`` mixing "sum" and "size" keeps the float dtype of the summed column."""
    frame = DataFrame(np.arange(9).reshape(3, 3))
    frame["test"] = 0
    frame["fl"] = [1.3, 1.5, 1.6]

    aggregated = frame.groupby("test").agg({"fl": "sum", 2: "size"})
    assert aggregated["fl"].dtype == np.float64
|
|
|
|
|
def test_handle_dict_return_value(df):
    """Returning a dict from ``apply`` behaves like returning the equivalent Series."""

    def extremes_as_dict(group):
        return {"max": group.max(), "min": group.min()}

    def extremes_as_series(group):
        return Series({"max": group.max(), "min": group.min()})

    from_dict = df.groupby("A")["C"].apply(extremes_as_dict)
    from_series = df.groupby("A")["C"].apply(extremes_as_series)

    assert isinstance(from_dict, Series)
    tm.assert_series_equal(from_dict, from_series)
|
|
|
|
|
@pytest.mark.parametrize("grouper", ["A", ["A", "B"]])
def test_set_group_name(df, grouper, using_infer_string):
    """
    Every object handed to a user function by apply / aggregate / transform
    must carry a non-None ``name``.
    """

    def f(group):
        assert group.name is not None
        return group

    def freduce(group):
        assert group.name is not None
        expect_string_failure = (
            using_infer_string and grouper == "A" and is_string_dtype(group.dtype)
        )
        if not expect_string_failure:
            return group.sum()
        with pytest.raises(TypeError, match="does not support"):
            group.sum()

    def freducex(x):
        # Second reducer with a distinct function name for the list-agg call.
        return freduce(x)

    gb = df.groupby(grouper, group_keys=False)

    # Frame-level calls; only ``apply`` is expected to warn.
    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        gb.apply(f)
    gb.aggregate(freduce)
    gb.aggregate({"C": freduce, "D": freduce})
    gb.transform(f)

    # Same operations on the selected column "C".
    gb["C"].apply(f)
    gb["C"].aggregate(freduce)
    gb["C"].aggregate([freduce, freducex])
    gb["C"].transform(f)
|
|
|
|
|
def test_group_name_available_in_inference_pass(): |
|
|
|
df = DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)}) |
|
|
|
names = [] |
|
|
|
def f(group): |
|
names.append(group.name) |
|
return group.copy() |
|
|
|
msg = "DataFrameGroupBy.apply operated on the grouping columns" |
|
with tm.assert_produces_warning(DeprecationWarning, match=msg): |
|
df.groupby("a", sort=False, group_keys=False).apply(f) |
|
|
|
expected_names = [0, 1, 2] |
|
assert names == expected_names |
|
|
|
|
|
def test_no_dummy_key_names(df):
    """Grouping by raw arrays leaves the result index level(s) unnamed."""
    a_values = df["A"].values
    single = df.groupby(a_values).sum()
    assert single.index.name is None

    double = df.groupby([df["A"].values, df["B"].values]).sum()
    assert double.index.names == (None, None)
|
|
|
|
|
def test_groupby_sort_multiindex_series():
    """
    Grouping a MultiIndex Series by both levels honours the ``sort`` flag:
    first-seen order for ``sort=False``, sorted order for ``sort=True``.
    """
    levels = [[1, 2], [1, 2]]
    source_index = MultiIndex(
        levels=levels,
        codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
        names=["a", "b"],
    )
    ser = Series([0, 1, 2, 3, 4, 5], index=source_index)

    first_seen_index = MultiIndex(
        levels=levels, codes=[[0, 0, 1], [1, 0, 0]], names=["a", "b"]
    )
    first_seen = Series([0, 2, 4], index=first_seen_index)

    unsorted = ser.groupby(level=["a", "b"], sort=False).first()
    tm.assert_series_equal(unsorted, first_seen)

    in_order = ser.groupby(level=["a", "b"], sort=True).first()
    tm.assert_series_equal(in_order, first_seen.sort_index())
|
|
|
|
|
def test_groupby_reindex_inside_function():
    """
    An aggregation closure that subsets its input must give the same result
    whether or not it first performs an extra label lookup on the data.
    """
    periods = 1000
    stamps = date_range(start="2012/1/1", freq="5min", periods=periods)
    frame = DataFrame(
        {"high": np.arange(periods), "low": np.arange(periods)}, index=stamps
    )

    def agg_before(func, fix=False):
        """
        Run an aggregate func on the subset of data.
        """

        def _func(data):
            before_eleven = data.index.map(lambda x: x.hour < 11)
            subset = data.loc[before_eleven].dropna()
            if fix:
                # Extra lookup only; its value is deliberately discarded.
                data[data.index[0]]
            return func(subset) if len(subset) != 0 else None

        return _func

    by_day = frame.groupby(lambda x: datetime(x.year, x.month, x.day))
    without_lookup = by_day.agg({"high": agg_before(np.max)})
    with_lookup = by_day.agg({"high": agg_before(np.max, True)})

    tm.assert_frame_equal(without_lookup, with_lookup)
|
|
|
|
|
def test_groupby_multiindex_missing_pair():
    """Only observed (group1, group2) pairs appear when grouping by both index levels."""
    frame = DataFrame(
        {
            "group1": ["a", "a", "a", "b"],
            "group2": ["c", "c", "d", "c"],
            "value": [1, 1, 1, 5],
        }
    ).set_index(["group1", "group2"])

    totals = frame.groupby(level=["group1", "group2"], sort=True).agg("sum")

    observed_pairs = MultiIndex.from_tuples(
        [("a", "c"), ("a", "d"), ("b", "c")], names=["group1", "group2"]
    )
    expected = DataFrame([[2], [1], [5]], index=observed_pairs, columns=["value"])
    tm.assert_frame_equal(totals, expected)
|
|
|
|
|
def test_groupby_multiindex_not_lexsorted():
    """
    Groupby results must not depend on whether MultiIndex columns or rows are
    lexsorted; the non-lexsorted column case is expected to emit a
    ``PerformanceWarning``.
    """
    # Columns built directly from tuples are lexsorted.
    sorted_columns = MultiIndex.from_tuples(
        [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
    )
    sorted_frame = DataFrame([[1, 3, 4]], columns=sorted_columns)
    assert sorted_frame.columns._is_lexsorted()

    # pivot_table followed by reset_index yields non-lexsorted columns.
    raw = DataFrame(
        columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
    )
    unsorted_frame = raw.pivot_table(
        index="a", columns=["b", "c"], values="d"
    ).reset_index()
    assert not unsorted_frame.columns._is_lexsorted()

    expected = sorted_frame.groupby("a").mean()
    with tm.assert_produces_warning(PerformanceWarning):
        result = unsorted_frame.groupby("a").mean()
    tm.assert_frame_equal(expected, result)

    # Row index that is not lexsorted: drop_duplicates per group is a no-op
    # here, so each result must equal its input frame.
    df = DataFrame(
        {"x": ["a", "a", "b", "a"], "y": [1, 1, 2, 2], "z": [1, 2, 3, 4]}
    ).set_index(["x", "y"])
    assert not df.index._is_lexsorted()

    for level in [0, 1, [0, 1]]:
        for sort in [False, True]:
            as_is = df.groupby(level=level, sort=sort, group_keys=False).apply(
                DataFrame.drop_duplicates
            )
            tm.assert_frame_equal(df, as_is)

            presorted = df.sort_index()
            deduplicated = presorted.groupby(
                level=level, sort=sort, group_keys=False
            ).apply(DataFrame.drop_duplicates)
            tm.assert_frame_equal(df.sort_index(), deduplicated)
|
|
|
|
|
def test_index_label_overlaps_location():
    """
    ``filter`` must select rows by position even when the index labels are
    integers (or floats) that overlap with valid positions.
    """
    group_labels = list("ababb")
    kept_positions = [1, 3, 4]

    def has_more_than_two(x):
        return len(x) > 2

    def check_frame_and_column(frame):
        kept = frame.groupby(group_labels).filter(has_more_than_two)
        tm.assert_frame_equal(kept, frame.iloc[kept_positions])

        column = frame[0]
        kept = column.groupby(group_labels).filter(has_more_than_two)
        tm.assert_series_equal(kept, column.take(kept_positions))

    df = DataFrame(list("ABCDE"), index=[2, 0, 2, 1, 1])
    check_frame_and_column(df)

    # Same checks again with the labels converted to floats.
    df.index = df.index.astype(float)
    check_frame_and_column(df)
|
|
|
|
|
def test_transform_doesnt_clobber_ints():
    """``transform("mean")`` gives the same result for integer and float group keys."""
    base = np.arange(6)
    int_keyed = DataFrame({"a": base // 2, "b": 2.0 * base, "c": 3.0 * base})
    float_keyed = DataFrame({"a": base // 2 * 1.0, "b": 2.0 * base, "c": 3.0 * base})

    from_int_keys = int_keyed.groupby("a").transform("mean")
    from_float_keys = float_keyed.groupby("a").transform("mean")
    tm.assert_frame_equal(from_int_keys, from_float_keys)
|
|
|
|
|
@pytest.mark.parametrize(
    "sort_column",
    ["ints", "floats", "strings", ["ints", "floats"], ["ints", "strings"]],
)
@pytest.mark.parametrize(
    "group_column", ["int_groups", "string_groups", ["int_groups", "string_groups"]]
)
def test_groupby_preserves_sort(sort_column, group_column):
    """Each group handed to ``apply`` keeps the row order of the pre-sorted frame."""
    unsorted = DataFrame(
        {
            "int_groups": [3, 1, 0, 1, 0, 3, 3, 3],
            "string_groups": ["z", "a", "z", "a", "a", "g", "g", "g"],
            "ints": [8, 7, 4, 5, 2, 9, 1, 1],
            "floats": [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
            "strings": ["z", "d", "a", "e", "word", "word2", "42", "47"],
        }
    )

    by_group = unsorted.sort_values(by=sort_column).groupby(group_column)

    def assert_still_sorted(x):
        # Re-sorting an already sorted group must change nothing.
        tm.assert_frame_equal(x, x.sort_values(by=sort_column))

    msg = "DataFrameGroupBy.apply operated on the grouping columns"
    with tm.assert_produces_warning(DeprecationWarning, match=msg):
        by_group.apply(assert_still_sorted)
|
|
|
|
|
def test_pivot_table_values_key_error():
    """``pivot_table`` with a non-existent ``values`` column raises ``KeyError``."""
    month_ends = date_range(datetime.today(), periods=20, freq="ME").tolist()
    frame = DataFrame({"eventDate": month_ends, "thename": range(20)})

    frame["year"] = frame.set_index("eventDate").index.year
    frame["month"] = frame.set_index("eventDate").index.month

    flattened = frame.reset_index()
    with pytest.raises(KeyError, match="'badname'"):
        flattened.pivot_table(
            index="year", columns="month", values="badname", aggfunc="count"
        )
|
|
|
|
|
@pytest.mark.parametrize("columns", ["C", ["C"]])
@pytest.mark.parametrize("keys", [["A"], ["A", "B"]])
@pytest.mark.parametrize(
    "values",
    [
        [True],
        [0],
        [0.0],
        ["a"],
        Categorical([0]),
        [to_datetime(0)],
        date_range(0, 1, 1, tz="US/Eastern"),
        pd.period_range("2016-01-01", periods=3, freq="D"),
        pd.array([0], dtype="Int64"),
        pd.array([0], dtype="Float64"),
        pd.array([False], dtype="boolean"),
    ],
    ids=[
        "bool",
        "int",
        "float",
        "str",
        "cat",
        "dt64",
        "dt64tz",
        "period",
        "Int64",
        "Float64",
        "boolean",
    ],
)
@pytest.mark.parametrize("method", ["attr", "agg", "apply"])
@pytest.mark.parametrize(
    "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"]
)
def test_empty_groupby(
    columns, keys, values, method, op, using_array_manager, dropna, using_infer_string
):
    """
    Run ``op`` on a groupby of an empty (zero-row) frame for many value dtypes.

    ``columns`` selects either a single column (string) or a list of columns,
    ``method`` chooses between calling the reduction as an attribute or via
    ``agg`` / ``apply`` with the op name. Depending on the dtype and op the
    call is expected either to raise or to return an empty result.
    """
    # Dtype the expected result is cast to at the end, when set.
    override_dtype = None

    if isinstance(values, BooleanArray) and op in ["sum", "prod"]:
        override_dtype = "Int64"

    if isinstance(values[0], bool) and op in ("prod", "sum"):
        override_dtype = "int64"

    # Three identical columns so that any of them can serve as key or value.
    df = DataFrame({"A": values, "B": values, "C": values}, columns=list("ABC"))

    if hasattr(values, "dtype"):
        # Sanity check: the frame kept the dtype of the input values.
        assert (df.dtypes == values.dtype).all()

    # Keep the dtypes but drop every row.
    df = df.iloc[:0]

    gb = df.groupby(keys, group_keys=False, dropna=dropna, observed=False)[columns]

    def get_result(**kwargs):
        # Dispatch either as gb.<op>(...) or as gb.agg(op, ...) / gb.apply(op, ...).
        if method == "attr":
            return getattr(gb, op)(**kwargs)
        else:
            return getattr(gb, method)(op, **kwargs)

    def get_categorical_invalid_expected():
        # Expected frame for categorical values: no columns, and an index
        # built from a one-element Categorical level (per key).
        lev = Categorical([0], dtype=values.dtype)
        if len(keys) != 1:
            idx = MultiIndex.from_product([lev, lev], names=keys)
        else:
            idx = Index(lev, name=keys[0])

        if using_infer_string:
            columns = Index([], dtype="string[pyarrow_numpy]")
        else:
            columns = []
        expected = DataFrame([], columns=columns, index=idx)
        return expected

    is_per = isinstance(df.dtypes.iloc[0], pd.PeriodDtype)
    is_dt64 = df.dtypes.iloc[0].kind == "M"
    is_cat = isinstance(values, Categorical)

    # Unordered categoricals: the order-based ops must raise.
    if (
        isinstance(values, Categorical)
        and not values.ordered
        and op in ["min", "max", "idxmin", "idxmax"]
    ):
        if op in ["min", "max"]:
            msg = f"Cannot perform {op} with non-ordered Categorical"
            klass = TypeError
        else:
            msg = f"Can't get {op} of an empty group due to unobserved categories"
            klass = ValueError
        with pytest.raises(klass, match=msg):
            get_result()

        if op in ["min", "max", "idxmin", "idxmax"] and isinstance(columns, list):
            # List selection only: numeric_only=True is additionally checked.
            result = get_result(numeric_only=True)
            expected = get_categorical_invalid_expected()
            tm.assert_equal(result, expected)
        return

    if op in ["prod", "sum", "skew"]:
        if is_dt64 or is_cat or is_per:
            # These dtypes are expected to reject the reduction outright.
            if is_dt64:
                msg = "datetime64 type does not support"
            elif is_per:
                msg = "Period type does not support"
            else:
                msg = "category type does not support"
            if op == "skew":
                # Accept either wording of the error for skew.
                msg = "|".join([msg, "does not support reduction 'skew'"])
            with pytest.raises(TypeError, match=msg):
                get_result()

            if not isinstance(columns, list):
                # Single-column selection: nothing further to check.
                return
            elif op == "skew":
                # numeric_only=True is not exercised for skew here.
                return
            else:
                # List selection with prod/sum: numeric_only=True is expected
                # to succeed and return a frame without columns.
                result = get_result(numeric_only=True)

                expected = df.set_index(keys)[[]]
                if is_cat:
                    expected = get_categorical_invalid_expected()
                tm.assert_equal(result, expected)
                return

    # Default path: the op succeeds and returns the empty selection.
    result = get_result()
    expected = df.set_index(keys)[columns]
    if op in ["idxmax", "idxmin"]:
        # idxmax/idxmin results are compared using the frame's index dtype.
        expected = expected.astype(df.index.dtype)
    if override_dtype is not None:
        expected = expected.astype(override_dtype)
    if len(keys) == 1:
        expected.index.name = keys[0]
    tm.assert_equal(result, expected)
|
|
|
|
|
def test_empty_groupby_apply_nonunique_columns(): |
|
|
|
df = DataFrame(np.random.default_rng(2).standard_normal((0, 4))) |
|
df[3] = df[3].astype(np.int64) |
|
df.columns = [0, 1, 2, 0] |
|
gb = df.groupby(df[1], group_keys=False) |
|
msg = "DataFrameGroupBy.apply operated on the grouping columns" |
|
with tm.assert_produces_warning(DeprecationWarning, match=msg): |
|
res = gb.apply(lambda x: x) |
|
assert (res.dtypes == df.dtypes).all() |
|
|
|
|
|
def test_tuple_as_grouping():
    """A tuple passed as ``by`` is looked up as one column key, not as a list of keys."""
    df = DataFrame(
        {
            ("a", "b"): [1, 1, 1, 1],
            "a": [2, 2, 2, 2],
            "b": [2, 2, 2, 2],
            "c": [1, 1, 1, 1],
        }
    )

    # The frame below has no ("a", "b") column, so the tuple key must raise.
    # re.escape makes the parentheses match literally; left unescaped they are
    # parsed as a regex group and the pattern only checks for "'a', 'b'".
    with pytest.raises(KeyError, match=re.escape("('a', 'b')")):
        df[["a", "b", "c"]].groupby(("a", "b"))

    result = df.groupby(("a", "b"))["c"].sum()
    expected = Series([4], name="c", index=Index([1], name=("a", "b")))
    tm.assert_series_equal(result, expected)
|
|
|
|
|
def test_tuple_correct_keyerror():
    """A missing tuple key on MultiIndex columns is reported whole in the KeyError."""
    columns = MultiIndex.from_product([[1, 2], [3, 4]])
    frame = DataFrame(1, index=range(3), columns=columns)

    with pytest.raises(KeyError, match=r"^\(7, 8\)$"):
        frame.groupby((7, 8)).mean()
|
|
|
|
|
def test_groupby_agg_ohlc_non_first():
    """``agg(["sum", "ohlc"])`` works when "ohlc" is not the first aggregation."""
    daily_index = date_range("2018-01-01", periods=2, freq="D", name="dti")
    df = DataFrame(
        [[1], [1]],
        columns=Index(["foo"], name="mycols"),
        index=daily_index,
    )

    ohlc_fields = ("open", "high", "low", "close")
    expected_columns = MultiIndex.from_tuples(
        [("foo", "sum", "foo")] + [("foo", "ohlc", field) for field in ohlc_fields],
        names=["mycols", None, None],
    )
    expected = DataFrame(
        [[1] * 5, [1] * 5],
        columns=expected_columns,
        index=daily_index,
    )

    result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"])

    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_multiindex_nat():
    """Grouping on the second MultiIndex level works when the first level holds NaT."""
    index_tuples = [
        (pd.NaT, "a"),
        (datetime(2012, 1, 2), "a"),
        (datetime(2012, 1, 2), "b"),
        (datetime(2012, 1, 3), "a"),
    ]
    ser = Series(
        [3, 2, 2.5, 4],
        index=MultiIndex.from_tuples(index_tuples, names=["date", None]),
    )

    result = ser.groupby(level=1).mean()

    tm.assert_series_equal(result, Series([3.0, 2.5], index=["a", "b"]))
|
|
|
|
|
def test_groupby_empty_list_raises():
    """Grouping by a list holding one empty list raises a length-mismatch ValueError."""
    pairs = list(zip(range(10), range(10)))
    frame = DataFrame(pairs, columns=["apple", "b"])

    with pytest.raises(ValueError, match="Grouper and axis must be same length"):
        frame.groupby([[]])
|
|
|
|
|
def test_groupby_multiindex_series_keys_len_equal_group_axis():
    """Level names group correctly when the key list is as long as the Series."""
    source_index = MultiIndex.from_arrays(
        [["x", "x"], ["a", "b"], ["k", "k"]],
        names=["first", "second", "third"],
    )
    ser = Series(data=[1, 2], index=source_index)

    result = ser.groupby(["first", "third"]).sum()

    expected_index = MultiIndex.from_arrays([["x"], ["k"]], names=["first", "third"])
    tm.assert_series_equal(result, Series([3], index=expected_index))
|
|
|
|
|
def test_groupby_groups_in_BaseGrouper():
    """``groups`` is the same whether a level is named directly or via ``Grouper``."""
    mi = MultiIndex.from_product([["A", "B"], ["C", "D"]], names=["alpha", "beta"])
    df = DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=mi)

    # Each pair: keys mixing a Grouper with a level name, and the plain-name equivalent.
    key_pairs = [
        ([Grouper(level="alpha"), "beta"], ["alpha", "beta"]),
        (["beta", Grouper(level="alpha")], ["beta", "alpha"]),
    ]
    for mixed_keys, plain_keys in key_pairs:
        result = df.groupby(mixed_keys)
        expected = df.groupby(plain_keys)
        assert result.groups == expected.groups
|
|
|
|
|
@pytest.mark.parametrize("group_name", ["x", ["x"]]) |
|
def test_groupby_axis_1(group_name): |
|
|
|
df = DataFrame( |
|
np.arange(12).reshape(3, 4), index=[0, 1, 0], columns=[10, 20, 10, 20] |
|
) |
|
df.index.name = "y" |
|
df.columns.name = "x" |
|
|
|
depr_msg = "DataFrame.groupby with axis=1 is deprecated" |
|
with tm.assert_produces_warning(FutureWarning, match=depr_msg): |
|
gb = df.groupby(group_name, axis=1) |
|
|
|
results = gb.sum() |
|
expected = df.T.groupby(group_name).sum().T |
|
tm.assert_frame_equal(results, expected) |
|
|
|
|
|
iterables = [["bar", "baz", "foo"], ["one", "two"]] |
|
mi = MultiIndex.from_product(iterables=iterables, names=["x", "x1"]) |
|
df = DataFrame(np.arange(18).reshape(3, 6), index=[0, 1, 0], columns=mi) |
|
with tm.assert_produces_warning(FutureWarning, match=depr_msg): |
|
gb = df.groupby(group_name, axis=1) |
|
results = gb.sum() |
|
expected = df.T.groupby(group_name).sum().T |
|
tm.assert_frame_equal(results, expected) |
|
|
|
|
|
@pytest.mark.parametrize(
    "op, expected",
    [
        (
            "shift",
            {
                "time": [
                    None,
                    None,
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    None,
                    None,
                ]
            },
        ),
        (
            "bfill",
            {
                "time": [
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    Timestamp("2019-01-01 14:00:00"),
                    Timestamp("2019-01-01 14:30:00"),
                    Timestamp("2019-01-01 14:00:00"),
                    Timestamp("2019-01-01 14:30:00"),
                ]
            },
        ),
        (
            "ffill",
            {
                "time": [
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    Timestamp("2019-01-01 14:00:00"),
                    Timestamp("2019-01-01 14:30:00"),
                ]
            },
        ),
    ],
)
def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected):
    """Grouped shift/bfill/ffill on a datetime column give tz-localized results."""
    tz = tz_naive_fixture

    def localize(frame):
        # Applied to both the input and the expected frame so they share the tz.
        return frame.time.dt.tz_localize(tz)

    raw = {
        "id": ["A", "B", "A", "B", "A", "B"],
        "time": [
            Timestamp("2019-01-01 12:00:00"),
            Timestamp("2019-01-01 12:30:00"),
            None,
            None,
            Timestamp("2019-01-01 14:00:00"),
            Timestamp("2019-01-01 14:30:00"),
        ],
    }
    df = DataFrame(raw).assign(time=localize)

    result = getattr(df.groupby("id"), op)()

    expected = DataFrame(expected).assign(time=localize)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_only_none_group():
    """Transforming when the only group key is None yields NaN for that row."""
    frame = DataFrame({"g": [None], "x": 1})
    grouped_x = frame.groupby("g")["x"]

    actual = grouped_x.transform("sum")

    tm.assert_series_equal(actual, Series([np.nan], name="x"))
|
|
|
|
|
def test_groupby_duplicate_index():
    """Mean over ``level=0`` collapses duplicated float index labels."""
    ser = Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0])

    result = ser.groupby(level=0).mean()

    expected = Series([2, 5.5, 8], index=[2.0, 4.0, 5.0])
    tm.assert_series_equal(result, expected)
|
|
|
|
|
def test_group_on_empty_multiindex(transformation_func, request):
    """Transforming an empty MultiIndex-ed object equals slicing the non-empty result."""
    df = DataFrame(
        data=[[1, Timestamp("today"), 3, 4]],
        columns=["col_1", "col_2", "col_3", "col_4"],
    )
    for name in ("col_3", "col_4"):
        df[name] = df[name].astype(int)
    df = df.set_index(["col_1", "col_2"])

    # Only "fillna" needs an argument and only "fillna" is expected to warn.
    is_fillna = transformation_func == "fillna"
    args = ("ffill",) if is_fillna else ()
    warn = FutureWarning if is_fillna else None
    # The expected result of these two is cast back to int before comparing.
    cast_expected = transformation_func in ("diff", "shift")

    # DataFrame case
    frame_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(warn, match=frame_msg):
        result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args)
    with tm.assert_produces_warning(warn, match=frame_msg):
        expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0]
    if cast_expected:
        expected = expected.astype(int)
    tm.assert_equal(result, expected)

    # Series case
    series_msg = "SeriesGroupBy.fillna is deprecated"
    column = df["col_3"]
    with tm.assert_produces_warning(warn, match=series_msg):
        result = column.iloc[:0].groupby(["col_1"]).transform(
            transformation_func, *args
        )
    with tm.assert_produces_warning(warn, match=series_msg):
        expected = (
            column.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0]
        )
    if cast_expected:
        expected = expected.astype(int)
    tm.assert_equal(result, expected)
|
|
|
|
|
def test_groupby_crash_on_nunique(axis):
    """``nunique`` with ``level=0`` works along either axis, including empty input."""
    dti = date_range("2016-01-01", periods=2, name="foo")
    df = DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]})
    df.columns.names = ("bar", "baz")
    df.index = dti

    axis_number = df._get_axis_number(axis)
    along_rows = axis_number == 0
    if along_rows:
        # Transpose so the MultiIndex being grouped sits on the rows.
        df = df.T
        msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
    else:
        msg = "DataFrame.groupby with axis=1 is deprecated"

    with tm.assert_produces_warning(FutureWarning, match=msg):
        gb = df.groupby(axis=axis_number, level=0)
    result = gb.nunique()

    expected = DataFrame({"A": [1, 2], "D": [1, 1]}, index=dti)
    expected.columns.name = "bar"
    if along_rows:
        expected = expected.T
    tm.assert_frame_equal(result, expected)

    # Same operation on an empty selection of the frame.
    if along_rows:
        with tm.assert_produces_warning(FutureWarning, match=msg):
            gb2 = df[[]].groupby(axis=axis_number, level=0)
        exp = expected[[]]
    else:
        with tm.assert_produces_warning(FutureWarning, match=msg):
            gb2 = df.loc[[]].groupby(axis=axis_number, level=0)
        exp = expected.loc[[]].astype(np.float64)

    res = gb2.nunique()
    tm.assert_frame_equal(res, exp)
|
|
|
|
|
def test_groupby_list_level():
    """Mean over ``level=[0]`` of a unique index returns the frame unchanged."""
    frame = DataFrame(np.arange(9).reshape(3, 3), dtype=float)

    result = frame.groupby(level=[0]).mean()

    tm.assert_frame_equal(result, frame)
|
|
|
|
|
@pytest.mark.parametrize(
    "max_seq_items, expected",
    [
        (5, "{0: [0], 1: [1], 2: [2], 3: [3], 4: [4]}"),
        (4, "{0: [0], 1: [1], 2: [2], 3: [3], ...}"),
        (1, "{0: [0], ...}"),
    ],
)
def test_groups_repr_truncates(max_seq_items, expected):
    """The repr of ``groups`` is truncated according to ``display.max_seq_items``."""
    df = DataFrame(np.random.default_rng(2).standard_normal((5, 1)))
    df["a"] = df.index

    with pd.option_context("display.max_seq_items", max_seq_items):
        # Group once by column label and once by an equivalent ndarray of keys.
        for grouper in ("a", np.array(df.a)):
            result = repr(df.groupby(grouper).groups)
            assert result == expected
|
|
|
|
|
def test_group_on_two_row_multiindex_returns_one_tuple_key():
    """Two rows sharing one MultiIndex entry give exactly one tuple key in ``indices``."""
    records = [{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}]
    df = DataFrame(records).set_index(["a", "b"])

    result = df.groupby(["a", "b"]).indices

    key = (1, 2)
    expected_positions = np.array([0, 1], dtype=np.int64)
    assert len(result) == 1
    assert (result[key] == expected_positions).all()
|
|
|
|
|
@pytest.mark.parametrize(
    "klass, attr, value",
    [
        (DataFrame, "level", "a"),
        (DataFrame, "as_index", False),
        (DataFrame, "sort", False),
        (DataFrame, "group_keys", False),
        (DataFrame, "observed", True),
        (DataFrame, "dropna", False),
        (Series, "level", "a"),
        (Series, "as_index", False),
        (Series, "sort", False),
        (Series, "group_keys", False),
        (Series, "observed", True),
        (Series, "dropna", False),
    ],
)
def test_subsetting_columns_keeps_attrs(klass, attr, value):
    """Column selection on a groupby object keeps the groupby keyword attribute.

    Parameters
    ----------
    klass : type
        ``DataFrame`` selects with ``[["b"]]``; anything else selects with ``["b"]``.
    attr : str
        Name of the ``groupby`` keyword under test.
    value : object
        Value passed for ``attr``.
    """
    df = DataFrame({"a": [1], "b": [2], "c": [3]})
    # "a" is moved into the index unconditionally: no parametrized case uses
    # attr == "axis", so the former ``attr != "axis"`` guard was always true.
    df = df.set_index("a")

    expected = df.groupby("a", **{attr: value})
    result = expected[["b"]] if klass is DataFrame else expected["b"]
    assert getattr(result, attr) == getattr(expected, attr)
|
|
|
|
|
def test_subsetting_columns_axis_1(): |
|
|
|
df = DataFrame({"A": [1], "B": [2], "C": [3]}) |
|
msg = "DataFrame.groupby with axis=1 is deprecated" |
|
with tm.assert_produces_warning(FutureWarning, match=msg): |
|
g = df.groupby([0, 0, 1], axis=1) |
|
match = "Cannot subset columns when using axis=1" |
|
with pytest.raises(ValueError, match=match): |
|
g[["A", "B"]].sum() |
|
|
|
|
|
@pytest.mark.parametrize("func", ["sum", "any", "shift"])
def test_groupby_column_index_name_lost(func):
    """The name of the columns Index is kept by the given groupby method."""
    named_columns = Index(["a"], name="idx")
    frame = DataFrame([[1]], columns=named_columns)

    method = getattr(frame.groupby([1]), func)
    result = method().columns

    tm.assert_index_equal(result, named_columns)
|
|
|
|
|
@pytest.mark.parametrize( |
|
"infer_string", |
|
[ |
|
False, |
|
pytest.param(True, marks=td.skip_if_no("pyarrow")), |
|
], |
|
) |
|
def test_groupby_duplicate_columns(infer_string): |
|
|
|
if infer_string: |
|
pytest.importorskip("pyarrow") |
|
df = DataFrame( |
|
{"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]} |
|
).astype(object) |
|
df.columns = ["A", "B", "B"] |
|
with pd.option_context("future.infer_string", infer_string): |
|
result = df.groupby([0, 0, 0, 0]).min() |
|
expected = DataFrame( |
|
[["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"], dtype=object |
|
) |
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
def test_groupby_series_with_tuple_name():
    """``last`` keeps tuple-valued Series and index names."""
    series_name = ("a", "a")
    index_name = ("b", "b")

    ser = Series([1, 2, 3, 4], index=[1, 1, 2, 2], name=series_name)
    ser.index.name = index_name

    result = ser.groupby(level=0).last()

    expected = Series([2, 4], index=[1, 2], name=series_name)
    expected.index.name = index_name
    tm.assert_series_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize(
    "func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])]
)
def test_groupby_numerical_stability_sum_mean(func, values):
    """sum/mean stay exact when large values cancel out within a group."""
    # Per group: +1e16, a small value, then four times -5e15.
    data = [1e16, 1e16, 97, 98] + [-5e15] * 4
    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})

    reducer = getattr(df.groupby("group"), func)
    result = reducer()

    group_index = Index([1, 2], name="group")
    expected = DataFrame({"a": values, "b": values}, index=group_index)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_numerical_stability_cumsum():
    """cumsum matches the expected running totals exactly despite large cancelling values."""
    data = [1e16, 1e16, 97, 98] + [-5e15] * 4
    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})

    result = df.groupby("group").cumsum()

    # Expected running totals, one pair (group 1, group 2) per step.
    exp_data = [
        1e16,
        1e16,
        1e16 + 96,
        1e16 + 98,
        5e15 + 97,
        5e15 + 98,
        97.0,
        98.0,
    ]
    expected = DataFrame({"a": exp_data, "b": exp_data})
    tm.assert_frame_equal(result, expected, check_exact=True)
|
|
|
|
|
def test_groupby_cumsum_skipna_false():
    """Grouped ``cumsum(skipna=False)`` with one group matches ``DataFrame.cumsum``."""
    values = np.random.default_rng(2).standard_normal((5, 5))
    # Put a NaN on the diagonal so every column contains exactly one missing value.
    np.fill_diagonal(values, np.nan)
    df = DataFrame(values)

    # A constant key puts every row into the same group.
    df["A"] = 1
    res = df.groupby("A").cumsum(skipna=False)

    expected = df[list(range(5))].cumsum(skipna=False)
    tm.assert_frame_equal(res, expected)
|
|
|
|
|
def test_groupby_cumsum_timedelta64():
    """Grouped cumsum on timedelta64 handles NaT for both ``skipna`` settings."""
    dti = date_range("2016-01-01", periods=5)
    ser = Series(dti) - dti[0]
    ser[2] = pd.NaT

    gb = DataFrame({"A": 1, "B": ser}).groupby("A")

    # Expected "B" column for each skipna setting.
    expected_by_skipna = {
        True: [ser[0], ser[1], pd.NaT, ser[4], ser[4] * 2],
        False: [ser[0], ser[1], pd.NaT, pd.NaT, pd.NaT],
    }
    for skipna, expected_values in expected_by_skipna.items():
        res = gb.cumsum(numeric_only=False, skipna=skipna)
        exp = DataFrame({"B": expected_values})
        tm.assert_frame_equal(res, exp)
|
|
|
|
|
def test_groupby_mean_duplicate_index(rand_series_with_duplicate_datetimeindex):
    """Mean over ``level=0`` equals mean when grouping by the index object itself."""
    ser = rand_series_with_duplicate_datetimeindex

    by_level = ser.groupby(level=0).mean()
    by_index = ser.groupby(ser.index).mean()

    tm.assert_series_equal(by_level, by_index)
|
|
|
|
|
def test_groupby_all_nan_groups_drop():
    """Summing when every group key is NaN gives an empty int64 result."""
    ser = Series([1, 2, 3], index=[np.nan] * 3)

    result = ser.groupby(ser.index).sum()

    empty_float_index = Index([], dtype=np.float64)
    expected = Series([], index=empty_float_index, dtype=np.int64)
    tm.assert_series_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_empty_multi_column(as_index, numeric_only):
    """Summing an empty frame grouped by two columns yields the expected empty frame."""
    df = DataFrame(data=[], columns=["A", "B", "C"])

    result = df.groupby(["A", "B"], as_index=as_index).sum(numeric_only=numeric_only)

    if as_index:
        index = MultiIndex([[], []], [[], []], names=["A", "B"])
        columns = [] if numeric_only else ["C"]
    else:
        index = RangeIndex(0)
        columns = ["A", "B"] if numeric_only else ["A", "B", "C"]
    expected = DataFrame([], columns=columns, index=index)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_aggregation_non_numeric_dtype():
    """``sum`` over a column of Python lists concatenates the lists per group."""
    rows = [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]]
    df = DataFrame(rows, columns=["MW", "v"])

    result = df.groupby(by=["MW"]).sum()

    expected = DataFrame(
        {"v": [[1, 1], [10, 20]]},
        index=Index(["M", "W"], dtype="object", name="MW"),
    )
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_aggregation_multi_non_numeric_dtype():
    """``sum`` aggregates several timedelta columns per group."""
    day_counts = range(1, 6)
    df = DataFrame(
        {
            "x": [1, 0, 1, 1, 0],
            "y": [Timedelta(i, "days") for i in day_counts],
            "z": [Timedelta(i * 10, "days") for i in day_counts],
        }
    )

    result = df.groupby(by=["x"]).sum()

    expected = DataFrame(
        {
            "y": [Timedelta(7, "days"), Timedelta(8, "days")],
            "z": [Timedelta(70, "days"), Timedelta(80, "days")],
        },
        index=Index([0, 1], dtype="int64", name="x"),
    )
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_aggregation_numeric_with_non_numeric_dtype():
    """``sum`` aggregates a timedelta column and an integer column side by side."""
    frame = DataFrame(
        {
            "x": [1, 0, 1, 1, 0],
            "y": [Timedelta(days, "days") for days in range(1, 6)],
            "z": list(range(1, 6)),
        }
    )

    result = frame.groupby(by=["x"]).sum()

    group_index = Index([0, 1], dtype="int64", name="x")
    expected = DataFrame(
        {"y": [Timedelta(7, "days"), Timedelta(8, "days")], "z": [7, 8]},
        index=group_index,
    )
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_filtered_df_std():
    """``std`` on a groupby of a boolean-filtered frame handles bool and float columns."""
    float_values = [10.5, 20.5, 30.5]
    filter_flags = [False, True, True]
    records = [
        {"filter_col": flag, "groupby_col": True, "bool_col": True, "float_col": val}
        for flag, val in zip(filter_flags, float_values)
    ]
    df = DataFrame(records)

    # The mask is built with an explicit comparison against True, as in the
    # scenario under test.
    mask = df["filter_col"] == True  # noqa: E712
    result = df[mask].groupby("groupby_col").std()

    expected = DataFrame(
        [[0.0, 0.0, 7.071068]],
        columns=["filter_col", "bool_col", "float_col"],
        index=Index([True], name="groupby_col"),
    )
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_datetime_categorical_multikey_groupby_indices():
    """``indices`` keys pair the string key with a Timestamp for a categorical key."""
    month_starts = to_datetime(["2018-01-01", "2018-02-01", "2018-03-01"])
    df = DataFrame(
        {
            "a": Series(list("abc")),
            "b": Series(month_starts, dtype="category"),
            "c": Categorical.from_codes([-1, 0, 1], categories=[0, 1]),
        }
    )

    result = df.groupby(["a", "b"], observed=False).indices

    expected = {
        ("a", Timestamp("2018-01-01 00:00:00")): np.array([0]),
        ("b", Timestamp("2018-02-01 00:00:00")): np.array([1]),
        ("c", Timestamp("2018-03-01 00:00:00")): np.array([2]),
    }
    assert result == expected
|
|
|
|
|
def test_rolling_wrong_param_min_period():
    """A misspelled ``min_period`` keyword to grouped ``rolling`` raises TypeError."""
    names = ["Alice"] * 5 + ["Bob"] * 5
    values = [np.nan, np.nan, 1, 2, 3] + [np.nan, 1, 2, 3, 4]
    test_df = DataFrame([names, values]).T
    test_df.columns = ["name", "val"]

    expected_error = (
        r"^[a-zA-Z._]*\(\) got an unexpected keyword argument 'min_period'"
    )
    with pytest.raises(TypeError, match=expected_error):
        test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()
|
|
|
|
|
@pytest.mark.parametrize(
    "dtype",
    [
        object,
        pytest.param("string[pyarrow_numpy]", marks=td.skip_if_no("pyarrow")),
    ],
)
def test_by_column_values_with_same_starting_value(dtype):
    """Group keys sharing a prefix ("Thomas" / "Thomas John") are aggregated separately."""
    moods = Series(["sad", "happy", "happy"], dtype=dtype)
    df = DataFrame(
        {
            "Name": ["Thomas", "Thomas", "Thomas John"],
            "Credit": [1200, 1300, 900],
            "Mood": moods,
        }
    )

    result = df.groupby(["Name"]).agg({"Mood": Series.mode, "Credit": "sum"})

    expected_result = DataFrame(
        {
            "Mood": [["happy", "sad"], "happy"],
            "Credit": [2500, 900],
            "Name": ["Thomas", "Thomas John"],
        }
    ).set_index("Name")
    tm.assert_frame_equal(result, expected_result)
|
|
|
|
|
def test_groupby_none_in_first_mi_level():
    """Rows whose first MultiIndex level is None are left out of a two-level groupby."""
    level_values = [[None, 1, 0, 1], [2, 3, 2, 3]]
    ser = Series(1, index=MultiIndex.from_arrays(level_values, names=["a", "b"]))

    result = ser.groupby(level=[0, 1]).sum()

    expected_index = MultiIndex.from_tuples([(0.0, 2), (1.0, 3)], names=["a", "b"])
    expected = Series([1, 2], expected_index)
    tm.assert_series_equal(result, expected)
|
|
|
|
|
def test_groupby_none_column_name():
    """A column labelled ``None`` can be used as the grouping key."""
    frame = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]})

    result = frame.groupby(by=[None]).sum()

    unnamed_index = Index([1, 2], name=None)
    expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=unnamed_index)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize("selection", [None, "a", ["a"]]) |
|
def test_single_element_list_grouping(selection): |
|
|
|
df = DataFrame({"a": [1, 2], "b": [np.nan, 5], "c": [np.nan, 2]}, index=["x", "y"]) |
|
grouped = df.groupby(["a"]) if selection is None else df.groupby(["a"])[selection] |
|
result = [key for key, _ in grouped] |
|
|
|
expected = [(1,), (2,)] |
|
assert result == expected |
|
|
|
|
|
def test_groupby_string_dtype():
    """Grouping by a "string"-dtype column with ``as_index=False`` keeps that dtype."""
    df = DataFrame({"str_col": ["a", "b", "c", "a"], "num_col": [1, 2, 3, 2]})
    df["str_col"] = df["str_col"].astype("string")

    result = df.groupby("str_col", as_index=False).mean()

    expected = DataFrame({"str_col": ["a", "b", "c"], "num_col": [1.5, 2.0, 3.0]})
    expected["str_col"] = expected["str_col"].astype("string")
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize( |
|
"level_arg, multiindex", [([0], False), ((0,), False), ([0], True), ((0,), True)] |
|
) |
|
def test_single_element_listlike_level_grouping_deprecation(level_arg, multiindex): |
|
|
|
df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"]) |
|
if multiindex: |
|
df = df.set_index(["a", "b"]) |
|
depr_msg = ( |
|
"Creating a Groupby object with a length-1 list-like " |
|
"level parameter will yield indexes as tuples in a future version. " |
|
"To keep indexes as scalars, create Groupby objects with " |
|
"a scalar level parameter instead." |
|
) |
|
with tm.assert_produces_warning(FutureWarning, match=depr_msg): |
|
[key for key, _ in df.groupby(level=level_arg)] |
|
|
|
|
|
@pytest.mark.parametrize("func", ["sum", "cumsum", "cumprod", "prod"])
def test_groupby_avoid_casting_to_float(func):
    """Large int64 values come back from the reduction exactly (difference is 0)."""
    val = 922337203685477580
    frame = DataFrame({"a": 1, "b": [val]})

    reduced = getattr(frame.groupby("a"), func)()
    result = reduced - val

    if func in ["cumsum", "cumprod"]:
        # Cumulative ops are compared against the original default row index.
        expected = DataFrame({"b": [0]})
    else:
        expected = DataFrame({"b": [0]}, index=Index([1], name="a"))
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize("func, val", [("sum", 3), ("prod", 2)])
def test_groupby_sum_support_mask(any_numeric_ea_dtype, func, val):
    """sum/prod on masked numeric dtypes skip ``pd.NA`` and keep the dtype."""
    dtype = any_numeric_ea_dtype
    df = DataFrame({"a": 1, "b": [1, 2, pd.NA]}, dtype=dtype)

    result = getattr(df.groupby("a"), func)()

    group_index = Index([1], name="a", dtype=dtype)
    expected = DataFrame({"b": [val]}, index=group_index, dtype=dtype)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize("val, dtype", [(111, "int"), (222, "uint")])
def test_groupby_overflow(val, dtype):
    """sum/cumsum/prod on 8-bit integers return 64-bit results instead of overflowing."""
    narrow = f"{dtype}8"
    wide = f"{dtype}64"
    df = DataFrame({"a": 1, "b": [val, val]}, dtype=narrow)

    # Reductions: the group index keeps the narrow dtype, the values are widened.
    for method, reduced in (("sum", val * 2), ("prod", val * val)):
        result = getattr(df.groupby("a"), method)()
        expected = DataFrame(
            {"b": [reduced]},
            index=Index([1], name="a", dtype=narrow),
            dtype=wide,
        )
        tm.assert_frame_equal(result, expected)

    # Cumulative sum is compared against a frame with the default row index.
    result = df.groupby("a").cumsum()
    expected = DataFrame({"b": [val, val * 2]}, dtype=wide)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize("skipna, val", [(True, 3), (False, pd.NA)])
def test_groupby_cumsum_mask(any_numeric_ea_dtype, skipna, val):
    """cumsum on masked numeric dtypes honours ``skipna`` after a ``pd.NA`` entry."""
    dtype = any_numeric_ea_dtype
    frame = DataFrame({"a": 1, "b": [1, pd.NA, 2]}, dtype=dtype)

    result = frame.groupby("a").cumsum(skipna=skipna)

    expected = DataFrame({"b": [1, pd.NA, val]}, dtype=dtype)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize(
    "val_in, index, val_out",
    [
        (
            [1.0, 2.0, 3.0, 4.0, 5.0],
            ["foo", "foo", "bar", "baz", "blah"],
            [3.0, 4.0, 5.0, 3.0],
        ),
        (
            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
            ["foo", "foo", "bar", "baz", "blah", "blah"],
            [3.0, 4.0, 11.0, 3.0],
        ),
    ],
)
def test_groupby_index_name_in_index_content(val_in, index, val_out):
    """Grouping by the index name works when that name is also an index value."""
    named_index = Index(index, name="blah")
    series = Series(data=val_in, name="values", index=named_index)

    expected = Series(
        data=val_out,
        name="values",
        index=Index(["bar", "baz", "blah", "foo"], name="blah"),
    )

    # Series case
    tm.assert_series_equal(series.groupby("blah").sum(), expected)

    # DataFrame case
    frame_result = series.to_frame().groupby("blah").sum()
    tm.assert_frame_equal(frame_result, expected.to_frame())
|
|
|
|
|
@pytest.mark.parametrize("n", [1, 10, 32, 100, 1000])
def test_sum_of_booleans(n):
    """Summing ``n`` True values within one group gives ``n``."""
    frame = DataFrame({"groupby_col": 1, "bool": [True] * n})
    frame["bool"] = frame["bool"].eq(True)

    result = frame.groupby("groupby_col").sum()

    group_index = Index([1], name="groupby_col")
    expected = DataFrame({"bool": [n]}, index=group_index)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
@pytest.mark.filterwarnings(
    "ignore:invalid value encountered in remainder:RuntimeWarning"
)
@pytest.mark.parametrize("method", ["head", "tail", "nth", "first", "last"])
def test_groupby_method_drop_na(method):
    """Rows whose group key is NaN are dropped by the given selection method."""
    df = DataFrame({"A": ["a", np.nan, "b", np.nan, "c"], "B": range(5)})

    # Only "nth" is called with an argument.
    kwargs = {"n": 0} if method == "nth" else {}
    result = getattr(df.groupby("A"), method)(**kwargs)

    if method in ["first", "last"]:
        # These two reduce, so the group keys become the index.
        key_index = Series(["a", "b", "c"], name="A")
        expected = DataFrame({"B": [0, 2, 4]}).set_index(key_index)
    else:
        expected = DataFrame({"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=[0, 2, 4])
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
def test_groupby_reduce_period():
    """Period data rejects sum/prod-style reductions but supports max and min."""
    pi = pd.period_range("2016-01-01", periods=100, freq="D")
    ser = pi.to_series()
    gb = ser.groupby(list(range(10)) * 10)

    unsupported = [
        ("sum", "Period type does not support sum operations"),
        ("cumsum", "Period type does not support cumsum operations"),
        ("prod", "Period type does not support prod operations"),
        ("cumprod", "Period type does not support cumprod operations"),
    ]
    for method, message in unsupported:
        with pytest.raises(TypeError, match=message):
            getattr(gb, method)()

    # max picks the last ten periods, min the first ten.
    for method, window in (("max", slice(-10, None)), ("min", slice(None, 10))):
        res = getattr(gb, method)()
        expected = ser[window]
        expected.index = Index(range(10), dtype=int)
        tm.assert_series_equal(res, expected)
|
|
|
|
|
def test_obj_with_exclusions_duplicate_columns():
    """``_obj_with_exclusions`` drops only the grouping column, keeping both 0 columns."""
    frame = DataFrame([[0, 1, 2, 3]])
    frame.columns = [0, 1, 2, 0]

    result = frame.groupby(frame[1])._obj_with_exclusions

    # Positions 0, 2 and 3 are every column except the one labelled 1.
    expected = frame.take([0, 2, 3], axis=1)
    tm.assert_frame_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_numeric_only_std_no_result(numeric_only):
    """``std`` on string-only data returns just the keys, or raises without numeric_only."""
    records = [{"a": "foo", "b": "bar"}, {"a": "car", "b": "dar"}]
    dfgb = DataFrame(records).groupby("a", as_index=False, sort=False)

    if not numeric_only:
        with pytest.raises(
            ValueError, match="could not convert string to float: 'bar'"
        ):
            dfgb.std(numeric_only=numeric_only)
        return

    result = dfgb.std(numeric_only=True)
    expected_df = DataFrame(["foo", "car"], columns=["a"])
    tm.assert_frame_equal(result, expected_df)
|
|
|
|
|
def test_grouping_with_categorical_interval_columns():
    """Grouping by ``qcut`` bins plus a string column gives the full product index."""
    df = DataFrame({"x": [0.1, 0.2, 0.3, -0.4, 0.5], "w": ["a", "b", "a", "c", "a"]})
    bins = pd.qcut(df["x"], q=np.linspace(0, 1, 5))

    result = df.groupby([bins, "w"], observed=False)["x"].agg("mean")

    edges = [-0.401, 0.1, 0.2, 0.3, 0.5]
    intervals = Categorical(
        [Interval(lo, hi, closed="right") for lo, hi in zip(edges, edges[1:])],
        ordered=True,
    )
    mi = MultiIndex.from_product([intervals, ["a", "b", "c"]], names=["x", "w"])

    nan = np.nan
    # One row per interval, one entry per value of "w" ("a", "b", "c").
    means = [
        [0.1, nan, -0.4],
        [nan, 0.2, nan],
        [0.3, nan, nan],
        [0.5, nan, nan],
    ]
    expected = Series(np.array(means).ravel(), index=mi, name="x")
    tm.assert_series_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize("bug_var", [1, "a"])
def test_groupby_sum_on_nan_should_return_nan(bug_var):
    """``sum(min_count=1)`` leaves a group holding only NaN as missing."""
    frame = DataFrame({"A": [bug_var, bug_var, bug_var, np.nan]})

    # The identity key function puts every row into its own group.
    result = frame.groupby(lambda x: x).sum(min_count=1)

    expected_df = DataFrame([bug_var, bug_var, bug_var, None], columns=["A"])
    tm.assert_frame_equal(result, expected_df)
|
|
|
|
|
@pytest.mark.parametrize(
    "method",
    [
        "count",
        "corr",
        "cummax",
        "cummin",
        "cumprod",
        "describe",
        "rank",
        "quantile",
        "diff",
        "shift",
        "all",
        "any",
        "idxmin",
        "idxmax",
        "ffill",
        "bfill",
        "pct_change",
    ],
)
def test_groupby_selection_with_methods(df, method):
    """Selecting after grouping matches grouping the pre-selected frame."""
    df.index = date_range("2014", periods=len(df))

    selected_after = df.groupby(["A"])[["C"]]
    selected_before = df[["C"]].groupby(df["A"])

    res = getattr(selected_after, method)()
    exp = getattr(selected_before, method)()

    tm.assert_frame_equal(res, exp)
|
|
|
|
|
def test_groupby_selection_other_methods(df):
    """Further methods agree between select-after-group and group-after-select."""
    df.columns.name = "foo"
    df.index = date_range("2014", periods=len(df))

    g = df.groupby(["A"])[["C"]]
    g_exp = df[["C"]].groupby(df["A"])

    # Deprecated members: checked inside their own warning contexts.
    warn_msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=warn_msg):
        tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0))
    msg = "DataFrameGroupBy.dtypes is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        tm.assert_frame_equal(g.dtypes, g_exp.dtypes)

    # Remaining methods: apply the same operation to both groupby objects.
    operations = [
        lambda gb: gb.apply(lambda x: x.sum()),
        lambda gb: gb.resample("D").mean(),
        lambda gb: gb.resample("D").ohlc(),
        lambda gb: gb.filter(lambda x: len(x) == 3),
    ]
    for operation in operations:
        tm.assert_frame_equal(operation(g), operation(g_exp))
|
|
|
|
|
def test_groupby_with_Time_Grouper(unit):
    """Counting per one-minute ``Grouper`` bin yields zero for the empty minutes."""
    stamps = to_datetime(
        [
            "2016-08-31 22:08:12.000",
            "2016-08-31 22:09:12.200",
            "2016-08-31 22:20:12.400",
        ]
    ).as_unit(unit)
    quantities = [1.0, 1.0, 3.0]
    test_data = DataFrame({"quant": quantities, "quant2": quantities, "time2": stamps})

    gb = test_data.groupby(Grouper(key="time2", freq="1min"))
    result = gb.count().reset_index()

    # 13 one-minute bins: observations in the first two and in the last one.
    counts = [1, 1] + [0] * 10 + [1]
    expected_output = DataFrame(
        {
            "time2": date_range(
                "2016-08-31 22:08:00", periods=13, freq="1min", unit=unit
            ),
            "quant": counts,
            "quant2": counts,
        }
    )
    tm.assert_frame_equal(result, expected_output)
|
|
|
|
|
def test_groupby_series_with_datetimeindex_month_name(): |
|
|
|
s = Series([0, 1, 0], index=date_range("2022-01-01", periods=3), name="jan") |
|
result = s.groupby(s).count() |
|
expected = Series([2, 1], name="jan") |
|
expected.index.name = "jan" |
|
tm.assert_series_equal(result, expected) |
|
|
|
|
|
@pytest.mark.parametrize("test_series", [True, False]) |
|
@pytest.mark.parametrize( |
|
"kwarg, value, name, warn", |
|
[ |
|
("by", "a", 1, None), |
|
("by", ["a"], 1, FutureWarning), |
|
("by", ["a"], (1,), None), |
|
("level", 0, 1, None), |
|
("level", [0], 1, FutureWarning), |
|
("level", [0], (1,), None), |
|
], |
|
) |
|
def test_depr_get_group_len_1_list_likes(test_series, kwarg, value, name, warn): |
|
|
|
obj = DataFrame({"b": [3, 4, 5]}, index=Index([1, 1, 2], name="a")) |
|
if test_series: |
|
obj = obj["b"] |
|
gb = obj.groupby(**{kwarg: value}) |
|
msg = "you will need to pass a length-1 tuple" |
|
with tm.assert_produces_warning(warn, match=msg): |
|
result = gb.get_group(name) |
|
if test_series: |
|
expected = Series([3, 4], index=Index([1, 1], name="a"), name="b") |
|
else: |
|
expected = DataFrame({"b": [3, 4]}, index=Index([1, 1], name="a")) |
|
tm.assert_equal(result, expected) |
|
|
|
|
|
def test_groupby_ngroup_with_nan():
    """Check ``ngroup`` when a Categorical key holds only NaN.

    Grouping by a NaN-only Categorical column plus an integer column, with
    ``dropna=False`` and ``observed=False``, must number the single row as
    group 0.
    """
    df = DataFrame({"a": Categorical([np.nan]), "b": [1]})
    result = df.groupby(["a", "b"], dropna=False, observed=False).ngroup()

    expected = Series([0])
    tm.assert_series_equal(result, expected)
|
|
|
|
|
def test_get_group_axis_1(): |
|
|
|
df = DataFrame( |
|
{ |
|
"col1": [0, 3, 2, 3], |
|
"col2": [4, 1, 6, 7], |
|
"col3": [3, 8, 2, 10], |
|
"col4": [1, 13, 6, 15], |
|
"col5": [-4, 5, 6, -7], |
|
} |
|
) |
|
with tm.assert_produces_warning(FutureWarning, match="deprecated"): |
|
grouped = df.groupby(axis=1, by=[1, 2, 3, 2, 1]) |
|
result = grouped.get_group(1) |
|
expected = DataFrame( |
|
{ |
|
"col1": [0, 3, 2, 3], |
|
"col5": [-4, 5, 6, -7], |
|
} |
|
) |
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
def test_groupby_ffill_with_duplicated_index():
    """Check ``groupby(level=0).ffill()`` when index labels repeat.

    Each index label (0, 1, 2) appears twice. The NaN in the second
    occurrence of labels 1 and 2 must be filled from the first occurrence
    of the same label, and the original row order must be preserved.
    """
    df = DataFrame({"a": [1, 2, 3, 4, np.nan, np.nan]}, index=[0, 1, 2, 0, 1, 2])

    result = df.groupby(level=0).ffill()

    # expected is built from ints, so dtype is deliberately not compared.
    expected = DataFrame({"a": [1, 2, 3, 4, 2, 3]}, index=[0, 1, 2, 0, 1, 2])
    tm.assert_frame_equal(result, expected, check_dtype=False)
|
|
|
|
|
@pytest.mark.parametrize("test_series", [True, False]) |
|
def test_decimal_na_sort(test_series): |
|
|
|
|
|
|
|
assert not isinstance(decimal.InvalidOperation, TypeError) |
|
df = DataFrame( |
|
{ |
|
"key": [Decimal(1), Decimal(1), None, None], |
|
"value": [Decimal(2), Decimal(3), Decimal(4), Decimal(5)], |
|
} |
|
) |
|
gb = df.groupby("key", dropna=False) |
|
if test_series: |
|
gb = gb["value"] |
|
result = gb._grouper.result_index |
|
expected = Index([Decimal(1), None], name="key") |
|
tm.assert_index_equal(result, expected) |
|
|