File size: 5,358 Bytes
7885a28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    date_range,
)
import pandas._testing as tm


@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_column_index_name_lost_fill_funcs(func):
    # GH: 29764 groupby loses index sometimes
    df = DataFrame(
        [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]],
        columns=Index(["type", "a", "b"], name="idx"),
    )
    df_grouped = df.groupby(["type"])[["a", "b"]]
    result = getattr(df_grouped, func)().columns
    expected = Index(["a", "b"], name="idx")
    tm.assert_index_equal(result, expected)


@pytest.mark.parametrize("func", ["ffill", "bfill"])
def test_groupby_fill_duplicate_column_names(func):
    # GH: 25610 ValueError with duplicate column names
    df1 = DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]})
    df2 = DataFrame({"field1": [1, np.nan, 4]})
    df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"])
    expected = DataFrame(
        [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"]
    )
    result = getattr(df_grouped, func)()
    tm.assert_frame_equal(result, expected)


def test_ffill_missing_arguments():
    # GH 14955
    df = DataFrame({"a": [1, 2], "b": [1, 1]})
    msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        with pytest.raises(ValueError, match="Must specify a fill"):
            df.groupby("b").fillna()


@pytest.mark.parametrize(
    "method, expected", [("ffill", [None, "a", "a"]), ("bfill", ["a", "a", None])]
)
def test_fillna_with_string_dtype(method, expected):
    # GH 40250
    df = DataFrame({"a": pd.array([None, "a", None], dtype="string"), "b": [0, 0, 0]})
    grp = df.groupby("b")
    msg = "DataFrameGroupBy.fillna is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = grp.fillna(method=method)
    expected = DataFrame({"a": pd.array(expected, dtype="string")})
    tm.assert_frame_equal(result, expected)


def test_fill_consistency():
    # GH9221
    # pass thru keyword arguments to the generated wrapper
    # are set if the passed kw is None (only)
    df = DataFrame(
        index=pd.MultiIndex.from_product(
            [["value1", "value2"], date_range("2014-01-01", "2014-01-06")]
        ),
        columns=Index(["1", "2"], name="id"),
    )
    df["1"] = [
        np.nan,
        1,
        np.nan,
        np.nan,
        11,
        np.nan,
        np.nan,
        2,
        np.nan,
        np.nan,
        22,
        np.nan,
    ]
    df["2"] = [
        np.nan,
        3,
        np.nan,
        np.nan,
        33,
        np.nan,
        np.nan,
        4,
        np.nan,
        np.nan,
        44,
        np.nan,
    ]

    msg = "The 'axis' keyword in DataFrame.groupby is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        expected = df.groupby(level=0, axis=0).fillna(method="ffill")

    msg = "DataFrame.groupby with axis=1 is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("method", ["ffill", "bfill"])
@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize("has_nan_group", [True, False])
def test_ffill_handles_nan_groups(dropna, method, has_nan_group):
    # GH 34725

    df_without_nan_rows = DataFrame([(1, 0.1), (2, 0.2)])

    ridx = [-1, 0, -1, -1, 1, -1]
    df = df_without_nan_rows.reindex(ridx).reset_index(drop=True)

    group_b = np.nan if has_nan_group else "b"
    df["group_col"] = pd.Series(["a"] * 3 + [group_b] * 3)

    grouped = df.groupby(by="group_col", dropna=dropna)
    result = getattr(grouped, method)(limit=None)

    expected_rows = {
        ("ffill", True, True): [-1, 0, 0, -1, -1, -1],
        ("ffill", True, False): [-1, 0, 0, -1, 1, 1],
        ("ffill", False, True): [-1, 0, 0, -1, 1, 1],
        ("ffill", False, False): [-1, 0, 0, -1, 1, 1],
        ("bfill", True, True): [0, 0, -1, -1, -1, -1],
        ("bfill", True, False): [0, 0, -1, 1, 1, -1],
        ("bfill", False, True): [0, 0, -1, 1, 1, -1],
        ("bfill", False, False): [0, 0, -1, 1, 1, -1],
    }

    ridx = expected_rows.get((method, dropna, has_nan_group))
    expected = df_without_nan_rows.reindex(ridx).reset_index(drop=True)
    # columns are a 'take' on df.columns, which are object dtype
    expected.columns = expected.columns.astype(object)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("min_count, value", [(2, np.nan), (-1, 1.0)])
@pytest.mark.parametrize("func", ["first", "last", "max", "min"])
def test_min_count(func, min_count, value):
    # GH#37821
    df = DataFrame({"a": [1] * 3, "b": [1, np.nan, np.nan], "c": [np.nan] * 3})
    result = getattr(df.groupby("a"), func)(min_count=min_count)
    expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
    tm.assert_frame_equal(result, expected)


def test_indices_with_missing():
    # GH 9304
    df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
    g = df.groupby(["a", "b"])
    result = g.indices
    expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
    assert result == expected