|
|
|
|
|
import numpy as np |
|
import pytest |
|
|
|
import pandas as pd |
|
import pandas._testing as tm |
|
|
|
|
|
@pytest.mark.parametrize( |
|
"arg, expected_rows", |
|
[ |
|
[0, [0, 1, 4]], |
|
[2, [5]], |
|
[5, []], |
|
[-1, [3, 4, 7]], |
|
[-2, [1, 6]], |
|
[-6, []], |
|
], |
|
) |
|
def test_int(slice_test_df, slice_test_grouped, arg, expected_rows): |
|
|
|
result = slice_test_grouped._positional_selector[arg] |
|
expected = slice_test_df.iloc[expected_rows] |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
def test_slice(slice_test_df, slice_test_grouped): |
|
|
|
result = slice_test_grouped._positional_selector[0:3:2] |
|
expected = slice_test_df.iloc[[0, 1, 4, 5]] |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
@pytest.mark.parametrize( |
|
"arg, expected_rows", |
|
[ |
|
[[0, 2], [0, 1, 4, 5]], |
|
[[0, 2, -1], [0, 1, 3, 4, 5, 7]], |
|
[range(0, 3, 2), [0, 1, 4, 5]], |
|
[{0, 2}, [0, 1, 4, 5]], |
|
], |
|
ids=[ |
|
"list", |
|
"negative", |
|
"range", |
|
"set", |
|
], |
|
) |
|
def test_list(slice_test_df, slice_test_grouped, arg, expected_rows): |
|
|
|
result = slice_test_grouped._positional_selector[arg] |
|
expected = slice_test_df.iloc[expected_rows] |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
def test_ints(slice_test_df, slice_test_grouped): |
|
|
|
result = slice_test_grouped._positional_selector[0, 2, -1] |
|
expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]] |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
def test_slices(slice_test_df, slice_test_grouped): |
|
|
|
result = slice_test_grouped._positional_selector[:2, -2:] |
|
expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
def test_mix(slice_test_df, slice_test_grouped): |
|
|
|
result = slice_test_grouped._positional_selector[0, 1, -2:] |
|
expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]] |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
@pytest.mark.parametrize( |
|
"arg, expected_rows", |
|
[ |
|
[0, [0, 1, 4]], |
|
[[0, 2, -1], [0, 1, 3, 4, 5, 7]], |
|
[(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]], |
|
], |
|
) |
|
def test_as_index(slice_test_df, arg, expected_rows): |
|
|
|
result = slice_test_df.groupby("Group", sort=False)._positional_selector[arg] |
|
expected = slice_test_df.iloc[expected_rows] |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
def test_doc_examples(): |
|
|
|
df = pd.DataFrame( |
|
[["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"] |
|
) |
|
|
|
grouped = df.groupby("A", as_index=False) |
|
|
|
result = grouped._positional_selector[1:2] |
|
expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4]) |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
result = grouped._positional_selector[1, -1] |
|
expected = pd.DataFrame( |
|
[["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4] |
|
) |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
@pytest.fixture() |
|
def multiindex_data(): |
|
rng = np.random.default_rng(2) |
|
ndates = 100 |
|
nitems = 20 |
|
dates = pd.date_range("20130101", periods=ndates, freq="D") |
|
items = [f"item {i}" for i in range(nitems)] |
|
|
|
data = {} |
|
for date in dates: |
|
nitems_for_date = nitems - rng.integers(0, 12) |
|
levels = [ |
|
(item, rng.integers(0, 10000) / 100, rng.integers(0, 10000) / 100) |
|
for item in items[:nitems_for_date] |
|
] |
|
levels.sort(key=lambda x: x[1]) |
|
data[date] = levels |
|
|
|
return data |
|
|
|
|
|
def _make_df_from_data(data): |
|
rows = {} |
|
for date in data: |
|
for level in data[date]: |
|
rows[(date, level[0])] = {"A": level[1], "B": level[2]} |
|
|
|
df = pd.DataFrame.from_dict(rows, orient="index") |
|
df.index.names = ("Date", "Item") |
|
return df |
|
|
|
|
|
def test_multiindex(multiindex_data): |
|
|
|
df = _make_df_from_data(multiindex_data) |
|
result = df.groupby("Date", as_index=False).nth(slice(3, -3)) |
|
|
|
sliced = {date: multiindex_data[date][3:-3] for date in multiindex_data} |
|
expected = _make_df_from_data(sliced) |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
@pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000]) |
|
@pytest.mark.parametrize("method", ["head", "tail"]) |
|
@pytest.mark.parametrize("simulated", [True, False]) |
|
def test_against_head_and_tail(arg, method, simulated): |
|
|
|
n_groups = 100 |
|
n_rows_per_group = 30 |
|
|
|
data = { |
|
"group": [ |
|
f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups) |
|
], |
|
"value": [ |
|
f"group {g} row {j}" |
|
for j in range(n_rows_per_group) |
|
for g in range(n_groups) |
|
], |
|
} |
|
df = pd.DataFrame(data) |
|
grouped = df.groupby("group", as_index=False) |
|
size = arg if arg >= 0 else n_rows_per_group + arg |
|
|
|
if method == "head": |
|
result = grouped._positional_selector[:arg] |
|
|
|
if simulated: |
|
indices = [ |
|
j * n_groups + i |
|
for j in range(size) |
|
for i in range(n_groups) |
|
if j * n_groups + i < n_groups * n_rows_per_group |
|
] |
|
expected = df.iloc[indices] |
|
|
|
else: |
|
expected = grouped.head(arg) |
|
|
|
else: |
|
result = grouped._positional_selector[-arg:] |
|
|
|
if simulated: |
|
indices = [ |
|
(n_rows_per_group + j - size) * n_groups + i |
|
for j in range(size) |
|
for i in range(n_groups) |
|
if (n_rows_per_group + j - size) * n_groups + i >= 0 |
|
] |
|
expected = df.iloc[indices] |
|
|
|
else: |
|
expected = grouped.tail(arg) |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
@pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10]) |
|
@pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10]) |
|
@pytest.mark.parametrize("step", [None, 1, 5]) |
|
def test_against_df_iloc(start, stop, step): |
|
|
|
n_rows = 30 |
|
|
|
data = { |
|
"group": ["group 0"] * n_rows, |
|
"value": list(range(n_rows)), |
|
} |
|
df = pd.DataFrame(data) |
|
grouped = df.groupby("group", as_index=False) |
|
|
|
result = grouped._positional_selector[start:stop:step] |
|
expected = df.iloc[start:stop:step] |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
def test_series(): |
|
|
|
ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"]) |
|
grouped = ser.groupby(level=0) |
|
result = grouped._positional_selector[1:2] |
|
expected = pd.Series([2, 5], index=["a", "b"]) |
|
|
|
tm.assert_series_equal(result, expected) |
|
|
|
|
|
@pytest.mark.parametrize("step", [1, 2, 3, 4, 5]) |
|
def test_step(step): |
|
|
|
data = [["x", f"x{i}"] for i in range(5)] |
|
data += [["y", f"y{i}"] for i in range(4)] |
|
data += [["z", f"z{i}"] for i in range(3)] |
|
df = pd.DataFrame(data, columns=["A", "B"]) |
|
|
|
grouped = df.groupby("A", as_index=False) |
|
|
|
result = grouped._positional_selector[::step] |
|
|
|
data = [["x", f"x{i}"] for i in range(0, 5, step)] |
|
data += [["y", f"y{i}"] for i in range(0, 4, step)] |
|
data += [["z", f"z{i}"] for i in range(0, 3, step)] |
|
|
|
index = [0 + i for i in range(0, 5, step)] |
|
index += [5 + i for i in range(0, 4, step)] |
|
index += [9 + i for i in range(0, 3, step)] |
|
|
|
expected = pd.DataFrame(data, columns=["A", "B"], index=index) |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
@pytest.fixture() |
|
def column_group_df(): |
|
return pd.DataFrame( |
|
[[0, 1, 2, 3, 4, 5, 6], [0, 0, 1, 0, 1, 0, 2]], |
|
columns=["A", "B", "C", "D", "E", "F", "G"], |
|
) |
|
|
|
|
|
def test_column_axis(column_group_df): |
|
msg = "DataFrame.groupby with axis=1" |
|
with tm.assert_produces_warning(FutureWarning, match=msg): |
|
g = column_group_df.groupby(column_group_df.iloc[1], axis=1) |
|
result = g._positional_selector[1:-1] |
|
expected = column_group_df.iloc[:, [1, 3]] |
|
|
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
def test_columns_on_iter(): |
|
|
|
df = pd.DataFrame({k: range(10) for k in "ABC"}) |
|
|
|
|
|
cols = ["A", "B"] |
|
for _, dg in df.groupby(df.A < 4)[cols]: |
|
tm.assert_index_equal(dg.columns, pd.Index(cols)) |
|
assert "C" not in dg.columns |
|
|
|
|
|
@pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array]) |
|
def test_groupby_duplicated_columns(func): |
|
|
|
df = pd.DataFrame( |
|
{ |
|
"A": [1, 2], |
|
"B": [3, 3], |
|
"C": ["G", "G"], |
|
} |
|
) |
|
result = df.groupby("C")[func(["A", "B", "A"])].mean() |
|
expected = pd.DataFrame( |
|
[[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C") |
|
) |
|
tm.assert_frame_equal(result, expected) |
|
|
|
|
|
def test_groupby_get_nonexisting_groups(): |
|
|
|
df = pd.DataFrame( |
|
data={ |
|
"A": ["a1", "a2", None], |
|
"B": ["b1", "b2", "b1"], |
|
"val": [1, 2, 3], |
|
} |
|
) |
|
grps = df.groupby(by=["A", "B"]) |
|
|
|
msg = "('a2', 'b1')" |
|
with pytest.raises(KeyError, match=msg): |
|
grps.get_group(("a2", "b1")) |
|
|