|
import numpy as np |
|
import pytest |
|
|
|
from pandas._libs import groupby as libgroupby |
|
from pandas._libs.groupby import ( |
|
group_cumprod, |
|
group_cumsum, |
|
group_mean, |
|
group_sum, |
|
group_var, |
|
) |
|
|
|
from pandas.core.dtypes.common import ensure_platform_int |
|
|
|
from pandas import isna |
|
import pandas._testing as tm |
|
|
|
|
|
class GroupVarTestMixin:
    """
    Shared checks for a grouped-variance kernel.

    The methods read three attributes that are not defined here and must be
    supplied by the concrete test class:

    algo : callable
        The kernel under test, invoked as ``algo(out, counts, values, labels)``.
    dtype : type
        Floating dtype used for ``out`` and ``values``.
    rtol : float
        Relative tolerance handed to ``np.allclose``.
    """

    def test_group_var_generic_1d(self):
        """Five groups of three rows each, one column."""
        rng = np.random.default_rng(2)
        n_groups, per_group = 5, 3

        result = np.full((n_groups, 1), np.nan, dtype=self.dtype)
        counts = np.zeros(n_groups, dtype="int64")
        values = 10 * rng.random((n_groups * per_group, 1)).astype(self.dtype)
        # Labels cycle 0..4, so row i belongs to group i % 5.
        labels = np.tile(np.arange(n_groups), (per_group,)).astype("intp")

        # Fortran-order reshape puts the three rows of each group on one line.
        by_group = np.squeeze(values).reshape((n_groups, per_group), order="F")
        expected_out = (by_group.std(axis=1, ddof=1) ** 2)[:, np.newaxis]
        expected_counts = np.full(n_groups, per_group, dtype="int64")

        self.algo(result, counts, values, labels)

        assert np.allclose(result, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_1d_flat_labels(self):
        """A single group containing all five rows."""
        rng = np.random.default_rng(2)
        n_rows = 5

        result = np.full((1, 1), np.nan, dtype=self.dtype)
        counts = np.zeros(1, dtype="int64")
        values = 10 * rng.random((n_rows, 1)).astype(self.dtype)
        labels = np.zeros(n_rows, dtype="intp")

        sample_var = values.std(ddof=1) ** 2
        expected_out = np.array([[sample_var]])
        expected_counts = np.full(1, n_rows, dtype="int64")

        self.algo(result, counts, values, labels)

        assert np.allclose(result, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_all_finite(self):
        """Five groups of two rows each, two fully populated columns."""
        rng = np.random.default_rng(2)
        n_groups, per_group, n_cols = 5, 2, 2

        result = np.full((n_groups, n_cols), np.nan, dtype=self.dtype)
        counts = np.zeros(n_groups, dtype="int64")
        values = 10 * rng.random((n_groups * per_group, n_cols)).astype(self.dtype)
        labels = np.tile(np.arange(n_groups), (per_group,)).astype("intp")

        # Leading axis enumerates the repetitions of each group.
        stacked = values.reshape(per_group, n_groups, n_cols)
        expected_out = np.std(stacked, ddof=1, axis=0) ** 2
        expected_counts = np.full(n_groups, per_group, dtype="int64")

        self.algo(result, counts, values, labels)

        assert np.allclose(result, expected_out, self.rtol)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_generic_2d_some_nan(self):
        """Same layout as the all-finite case, but the second column is all NaN."""
        rng = np.random.default_rng(2)
        n_groups, per_group = 5, 2

        result = np.full((n_groups, 2), np.nan, dtype=self.dtype)
        counts = np.zeros(n_groups, dtype="int64")
        values = 10 * rng.random((n_groups * per_group, 2)).astype(self.dtype)
        values[:, 1] = np.nan
        labels = np.tile(np.arange(n_groups), (per_group,)).astype("intp")

        first_col = values[:, 0].reshape(n_groups, per_group, order="F")
        finite_part = first_col.std(ddof=1, axis=1) ** 2
        nan_part = np.nan * np.ones(n_groups)
        expected_out = np.vstack([finite_part, nan_part]).T.astype(self.dtype)
        expected_counts = np.full(n_groups, per_group, dtype="int64")

        self.algo(result, counts, values, labels)

        tm.assert_almost_equal(result, expected_out, rtol=0.5e-06)
        tm.assert_numpy_array_equal(counts, expected_counts)

    def test_group_var_constant(self):
        """Three identical values in one group: result must be non-negative and ~0."""
        n_rows = 3

        result = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype="int64")
        values = np.full((n_rows, 1), 0.832845131556193, dtype=self.dtype)
        labels = np.zeros(n_rows, dtype="intp")

        self.algo(result, counts, values, labels)

        assert counts[0] == 3
        variance = result[0, 0]
        assert variance >= 0
        tm.assert_almost_equal(variance, 0.0)
|
|
|
|
|
class TestGroupVarFloat64(GroupVarTestMixin):
    """Run the shared grouped-variance checks against ``group_var`` on float64."""

    __test__ = True

    # Wrapped in staticmethod so ``self.algo(...)`` calls the kernel without
    # passing the test instance as the first argument.
    algo = staticmethod(group_var)
    dtype = np.float64
    rtol = 1e-5

    def test_group_var_large_inputs(self):
        """
        Variance of one million draws from ``Generator.random`` offset by 10**12.

        All rows fall into a single group; the result is compared against
        ``1 / 12`` with a relative tolerance of ``0.5e-3``.
        """
        prng = np.random.default_rng(2)
        n_rows = 10**6

        out = np.array([[np.nan]], dtype=self.dtype)
        counts = np.array([0], dtype="int64")
        # Build the single-column layout with reshape() rather than assigning
        # to ``.shape``; NumPy discourages the in-place form.
        values = (prng.random(n_rows) + 10**12).astype(self.dtype).reshape(n_rows, 1)
        labels = np.zeros(n_rows, dtype="intp")

        self.algo(out, counts, values, labels)

        assert counts[0] == n_rows
        tm.assert_almost_equal(out[0, 0], 1.0 / 12, rtol=0.5e-3)
|
|
|
|
|
class TestGroupVarFloat32(GroupVarTestMixin):
    """Run the shared grouped-variance checks against ``group_var`` on float32."""

    __test__ = True

    # Configuration read by the inherited test methods.
    dtype = np.float32
    rtol = 1e-2
    algo = staticmethod(group_var)
|
|
|
|
|
@pytest.mark.parametrize("dtype", ["float32", "float64"])
def test_group_ohlc(dtype):
    """
    Check ``libgroupby.group_ohlc`` on 20 values split into groups of 6, 6 and 8.

    The kernel output is compared with [first, max, min, last] computed per
    slice, first on finite data and then again after the first group has
    been overwritten with NaN.
    """
    series = np.array(np.random.default_rng(2).standard_normal(20), dtype=dtype)

    edges = np.array([6, 12, 20])
    group_sizes = np.diff(np.r_[0, edges])
    labels = ensure_platform_int(np.repeat(np.arange(3), group_sizes))

    out = np.zeros((3, 4), dtype)
    counts = np.zeros(len(out), dtype=np.int64)

    kernel = libgroupby.group_ohlc
    kernel(out, counts, series[:, None], labels)

    def _expected_row(chunk):
        # An all-NA chunk maps to four NaNs; otherwise first/max/min/last.
        if not isna(chunk).all():
            return [chunk[0], chunk.max(), chunk.min(), chunk[-1]]
        return np.repeat(np.nan, 4)

    chunks = (series[:6], series[6:12], series[12:])
    expected = np.array([_expected_row(chunk) for chunk in chunks])

    tm.assert_almost_equal(out, expected)
    expected_counts = np.array([6, 6, 8], dtype=np.int64)
    tm.assert_numpy_array_equal(counts, expected_counts)

    # Second pass: the first group consists only of NaN.
    series[:6] = np.nan
    kernel(out, counts, series[:, None], labels)
    expected[0] = np.nan
    tm.assert_almost_equal(out, expected)
|
|
|
|
|
def _check_cython_group_transform_cumulative(pd_op, np_op, dtype): |
|
""" |
|
Check a group transform that executes a cumulative function. |
|
|
|
Parameters |
|
---------- |
|
pd_op : callable |
|
The pandas cumulative function. |
|
np_op : callable |
|
The analogous one in NumPy. |
|
dtype : type |
|
The specified dtype of the data. |
|
""" |
|
is_datetimelike = False |
|
|
|
data = np.array([[1], [2], [3], [4]], dtype=dtype) |
|
answer = np.zeros_like(data) |
|
|
|
labels = np.array([0, 0, 0, 0], dtype=np.intp) |
|
ngroups = 1 |
|
pd_op(answer, data, labels, ngroups, is_datetimelike) |
|
|
|
tm.assert_numpy_array_equal(np_op(data), answer[:, 0], check_dtype=False) |
|
|
|
|
|
@pytest.mark.parametrize("np_dtype", ["int64", "uint64", "float32", "float64"])
def test_cython_group_transform_cumsum(np_dtype):
    """Check ``group_cumsum`` against ``np.cumsum`` for the given dtype name."""
    scalar_type = np.dtype(np_dtype).type
    _check_cython_group_transform_cumulative(group_cumsum, np.cumsum, scalar_type)
|
|
|
|
|
def test_cython_group_transform_cumprod():
    """Check ``group_cumprod`` against ``np.cumprod`` on float64 input."""
    _check_cython_group_transform_cumulative(group_cumprod, np.cumprod, np.float64)
|
|
|
|
|
def test_cython_group_transform_algos():
    """
    Check ``group_cumprod`` / ``group_cumsum`` on a single group of five rows.

    First with float64 input containing one NaN, then ``group_cumsum`` on
    timedelta64[ns] data passed as its int64 view.
    """
    n_rows = 5
    labels = np.zeros(n_rows, dtype=np.intp)
    ngroups = 1

    # Float input with a NaN in the fourth row.
    is_datetimelike = False
    data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
    float_cases = [
        (group_cumprod, [1, 2, 6, np.nan, 24]),
        (group_cumsum, [1, 3, 6, np.nan, 10]),
    ]
    for kernel, expected_values in float_cases:
        actual = np.full_like(data, np.nan)
        kernel(actual, data, labels, ngroups, is_datetimelike)
        expected = np.array(expected_values, dtype="float64")
        tm.assert_numpy_array_equal(actual[:, 0], expected)

    # Timedelta input: five steps of 1 ns.
    is_datetimelike = True
    data = np.full((n_rows, 1), np.timedelta64(1, "ns"), dtype="m8[ns]")
    actual = np.zeros(data.shape, dtype="int64")
    group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
    expected = np.array(
        [np.timedelta64(step, "ns") for step in range(1, n_rows + 1)]
    )
    tm.assert_numpy_array_equal(actual[:, 0].view("m8[ns]"), expected)
|
|
|
|
|
def test_cython_group_mean_datetimelike():
    """
    Check ``group_mean`` with ``is_datetimelike=True`` on 2 ns, 4 ns and NaT.

    The expected mean of 3 corresponds to averaging only the two non-NaT
    entries.
    """
    deltas = np.array(
        [np.timedelta64(2, "ns"), np.timedelta64(4, "ns"), np.timedelta64("NaT")],
        dtype="m8[ns]",
    )
    # The kernel receives the int64 view of the timedeltas cast to float64,
    # laid out as a single column.
    data = deltas[:, None].view("int64").astype("float64")
    labels = np.zeros(len(data), dtype=np.intp)
    counts = np.array([0], dtype="int64")
    actual = np.zeros(shape=(1, 1), dtype="float64")

    group_mean(actual, counts, data, labels, is_datetimelike=True)

    expected = np.array([3], dtype="float64")
    tm.assert_numpy_array_equal(actual[:, 0], expected)
|
|
|
|
|
def test_cython_group_mean_wrong_min_count():
    """``group_mean`` called with ``min_count=0`` must raise an AssertionError."""
    data = np.zeros((1, 1), dtype="float64")
    labels = np.zeros(1, dtype=np.intp)
    counts = np.zeros(1, dtype="int64")
    actual = np.zeros(shape=(1, 1), dtype="float64")

    # The error message is required to mention "min_count".
    with pytest.raises(AssertionError, match="min_count"):
        group_mean(actual, counts, data, labels, is_datetimelike=True, min_count=0)
|
|
|
|
|
def test_cython_group_mean_not_datetimelike_but_has_NaT_values():
    """
    Check ``group_mean`` with ``is_datetimelike=False`` on two NaT-derived floats.

    The expected result is the plain arithmetic mean of the two float
    values, i.e. they are treated as ordinary numbers.
    """
    nat_pair = np.array(
        [np.timedelta64("NaT"), np.timedelta64("NaT")],
        dtype="m8[ns]",
    )
    # int64 view of NaT cast to float64, laid out as a single column.
    data = nat_pair[:, None].view("int64").astype("float64")
    labels = np.zeros(len(data), dtype=np.intp)
    counts = np.array([0], dtype="int64")
    actual = np.zeros(shape=(1, 1), dtype="float64")

    group_mean(actual, counts, data, labels, is_datetimelike=False)

    row_sum = np.add(data[0], data[1])
    expected = np.array(np.divide(row_sum, 2), dtype="float64")
    tm.assert_numpy_array_equal(actual[:, 0], expected)
|
|
|
|
|
def test_cython_group_mean_Inf_at_begining_and_end(): |
|
|
|
actual = np.array([[np.nan, np.nan], [np.nan, np.nan]], dtype="float64") |
|
counts = np.array([0, 0], dtype="int64") |
|
data = np.array( |
|
[[np.inf, 1.0], [1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0], [5, np.inf]], |
|
dtype="float64", |
|
) |
|
labels = np.array([0, 1, 0, 1, 0, 1], dtype=np.intp) |
|
|
|
group_mean(actual, counts, data, labels, is_datetimelike=False) |
|
|
|
expected = np.array([[np.inf, 3], [3, np.inf]], dtype="float64") |
|
|
|
tm.assert_numpy_array_equal( |
|
actual, |
|
expected, |
|
) |
|
|
|
|
|
@pytest.mark.parametrize( |
|
"values, out", |
|
[ |
|
([[np.inf], [np.inf], [np.inf]], [[np.inf], [np.inf]]), |
|
([[np.inf], [np.inf], [-np.inf]], [[np.inf], [np.nan]]), |
|
([[np.inf], [-np.inf], [np.inf]], [[np.inf], [np.nan]]), |
|
([[np.inf], [-np.inf], [-np.inf]], [[np.inf], [-np.inf]]), |
|
], |
|
) |
|
def test_cython_group_sum_Inf_at_begining_and_end(values, out): |
|
|
|
actual = np.array([[np.nan], [np.nan]], dtype="float64") |
|
counts = np.array([0, 0], dtype="int64") |
|
data = np.array(values, dtype="float64") |
|
labels = np.array([0, 1, 1], dtype=np.intp) |
|
|
|
group_sum(actual, counts, data, labels, None, is_datetimelike=False) |
|
|
|
expected = np.array(out, dtype="float64") |
|
|
|
tm.assert_numpy_array_equal( |
|
actual, |
|
expected, |
|
) |
|
|