|
import datetime |
|
import decimal |
|
import re |
|
|
|
import numpy as np |
|
import pytest |
|
import pytz |
|
|
|
import pandas as pd |
|
import pandas._testing as tm |
|
from pandas.api.extensions import register_extension_dtype |
|
from pandas.arrays import ( |
|
BooleanArray, |
|
DatetimeArray, |
|
FloatingArray, |
|
IntegerArray, |
|
IntervalArray, |
|
SparseArray, |
|
TimedeltaArray, |
|
) |
|
from pandas.core.arrays import ( |
|
NumpyExtensionArray, |
|
period_array, |
|
) |
|
from pandas.tests.extension.decimal import ( |
|
DecimalArray, |
|
DecimalDtype, |
|
to_decimal, |
|
) |
|
|
|
|
|
@pytest.mark.parametrize("dtype_unit", ["M8[h]", "M8[m]", "m8[h]", "m8[m]"])
def test_dt64_array(dtype_unit):
    """
    Check that ``pd.array`` emits a ``FutureWarning`` for coarse resolutions.

    Hour and minute resolutions are exercised for both datetime64 (``M8``)
    and timedelta64 (``m8``) dtypes; the parametrization previously listed
    ``"M8[m]"`` twice and never covered ``"m8[m]"``.
    """
    dtype_var = np.dtype(dtype_unit)
    msg = (
        r"datetime64 and timedelta64 dtype resolutions other than "
        r"'s', 'ms', 'us', and 'ns' are deprecated. "
        r"In future releases passing unsupported resolutions will "
        r"raise an exception."
    )
    # The message contains "." characters, so escape it before using it as a
    # regular expression for the warning match.
    with tm.assert_produces_warning(FutureWarning, match=re.escape(msg)):
        pd.array([], dtype=dtype_var)
|
|
|
|
|
@pytest.mark.parametrize(
    "data, dtype, expected",
    [
        # No dtype, the ``object`` type, or a NumPy dtype object
        ([], None, FloatingArray._from_sequence([], dtype="Float64")),
        ([1, 2], None, IntegerArray._from_sequence([1, 2], dtype="Int64")),
        ([1, 2], object, NumpyExtensionArray(np.array([1, 2], dtype=object))),
        (
            [1, 2],
            np.dtype("float32"),
            NumpyExtensionArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
        ),
        (
            np.array([], dtype=object),
            None,
            NumpyExtensionArray(np.array([], dtype=object)),
        ),
        (
            np.array([1, 2], dtype="int64"),
            None,
            IntegerArray._from_sequence([1, 2], dtype="Int64"),
        ),
        (
            np.array([1.0, 2.0], dtype="float64"),
            None,
            FloatingArray._from_sequence([1.0, 2.0], dtype="Float64"),
        ),
        # NumPy dtype given as a string alias
        ([1, 2], "float32", NumpyExtensionArray(np.array([1, 2], dtype="float32"))),
        ([1, 2], "int64", NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
        # float16 ndarray without a dtype is expected as a NumpyExtensionArray
        (
            np.array([1, 2], dtype=np.float16),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.float16)),
        ),
        # A NumpyExtensionArray input yields an equal NumpyExtensionArray
        (
            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.int32)),
        ),
        # Period dtype given as a string alias
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            "Period[D]",
            period_array(["2000", "2001"], freq="D"),
        ),
        # Period dtype given as a PeriodDtype instance
        (
            [pd.Period("2000", "D")],
            pd.PeriodDtype("D"),
            period_array(["2000"], freq="D"),
        ),
        # Datetime (tz-naive)
        (
            [1, 2],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
            ),
        ),
        (
            [1, 2],
            np.dtype("datetime64[s]"),
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[s]"), dtype="M8[s]"
            ),
        ),
        (
            np.array([1, 2], dtype="datetime64[ns]"),
            None,
            DatetimeArray._from_sequence(
                np.array([1, 2], dtype="M8[ns]"), dtype="M8[ns]"
            ),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            pd.DatetimeIndex(["2000", "2001"]),
            None,
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            ["2000", "2001"],
            np.dtype("datetime64[ns]"),
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        # Datetime (tz-aware)
        (
            ["2000", "2001"],
            pd.DatetimeTZDtype(tz="CET"),
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET")
            ),
        ),
        # Timedelta
        (
            ["1h", "2h"],
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            pd.TimedeltaIndex(["1h", "2h"]),
            np.dtype("timedelta64[ns]"),
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            np.array([1, 2], dtype="m8[s]"),
            np.dtype("timedelta64[s]"),
            TimedeltaArray._from_sequence(
                np.array([1, 2], dtype="m8[s]"), dtype="m8[s]"
            ),
        ),
        (
            pd.TimedeltaIndex(["1h", "2h"]),
            None,
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            # second-resolution TimedeltaArray, no dtype: resolution is kept
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            None,
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        (
            # second-resolution TimedeltaArray with the matching dtype
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
            np.dtype("m8[s]"),
            TimedeltaArray._simple_new(
                np.arange(5, dtype=np.int64).view("m8[s]"), dtype=np.dtype("m8[s]")
            ),
        ),
        # Categorical
        (["a", "b"], "category", pd.Categorical(["a", "b"])),
        (
            ["a", "b"],
            pd.CategoricalDtype(None, ordered=True),
            pd.Categorical(["a", "b"], ordered=True),
        ),
        # Interval
        (
            [pd.Interval(1, 2), pd.Interval(3, 4)],
            "interval",
            IntervalArray.from_tuples([(1, 2), (3, 4)]),
        ),
        # Sparse
        ([0, 1], "Sparse[int64]", SparseArray([0, 1], dtype="int64")),
        # Masked integer: the expected value is built with the array class
        # directly rather than with ``pd.array`` (the function under test),
        # so the case is not a tautology.
        ([1, None], "Int16", IntegerArray._from_sequence([1, None], dtype="Int16")),
        (
            pd.Series([1, 2]),
            None,
            NumpyExtensionArray(np.array([1, 2], dtype=np.int64)),
        ),
        # String
        (
            ["a", None],
            "string",
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        (
            ["a", None],
            pd.StringDtype(),
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        # Boolean
        (
            [True, None],
            "boolean",
            BooleanArray._from_sequence([True, None], dtype="boolean"),
        ),
        (
            [True, None],
            pd.BooleanDtype(),
            BooleanArray._from_sequence([True, None], dtype="boolean"),
        ),
        # Index input
        (pd.Index([1, 2]), None, NumpyExtensionArray(np.array([1, 2], dtype=np.int64))),
        # Series backed by a Categorical, no dtype
        (
            pd.Series(pd.Categorical(["a", "b"], categories=["a", "b", "c"])),
            None,
            pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
        ),
        # Extension dtype from the decimal test package, by string alias
        ([decimal.Decimal(0), decimal.Decimal(1)], "decimal", to_decimal([0, 1])),
        # Extension array input combined with a different dtype
        (
            period_array(["2000", "2001"], freq="D"),
            "category",
            pd.Categorical([pd.Period("2000", "D"), pd.Period("2001", "D")]),
        ),
    ],
)
def test_array(data, dtype, expected):
    """
    Check ``pd.array(data, dtype=dtype)`` against an explicitly built array.

    Parameters
    ----------
    data : object
        Input passed as the first argument of ``pd.array``.
    dtype : object
        Value passed as ``dtype``; ``None`` in the cases without a dtype.
    expected : object
        Array the result is compared to with ``tm.assert_equal``.
    """
    result = pd.array(data, dtype=dtype)
    tm.assert_equal(result, expected)
|
|
|
|
|
def test_array_copy():
    """Check how the ``copy`` argument of ``pd.array`` affects memory sharing."""
    source = np.array([1, 2])

    # ``copy`` omitted: the result must not share memory with the input.
    default_result = pd.array(source, dtype=source.dtype)
    assert not tm.shares_memory(source, default_result)

    # ``copy=True``: the result must not share memory with the input.
    copied = pd.array(source, dtype=source.dtype, copy=True)
    assert not tm.shares_memory(source, copied)

    # ``copy=False``: the result must share memory with the input.
    shared = pd.array(source, dtype=source.dtype, copy=False)
    assert tm.shares_memory(source, shared)
|
|
|
|
|
# pytz "CET" timezone object, used as ``tzinfo`` and as the ``tz`` of the
# expected dtype in the stdlib-datetime case of the table below.
cet = pytz.timezone("CET")


@pytest.mark.parametrize(
    "data, expected",
    [
        # period
        (
            [pd.Period("2000", "D"), pd.Period("2001", "D")],
            period_array(["2000", "2001"], freq="D"),
        ),
        # interval
        ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])),
        # datetime (tz-naive)
        (
            [pd.Timestamp("2000"), pd.Timestamp("2001")],
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)],
            DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"),
        ),
        (
            np.array([1, 2], dtype="M8[ns]"),
            DatetimeArray._from_sequence(np.array([1, 2], dtype="M8[ns]")),
        ),
        (
            np.array([1, 2], dtype="M8[us]"),
            DatetimeArray._simple_new(
                np.array([1, 2], dtype="M8[us]"), dtype=np.dtype("M8[us]")
            ),
        ),
        # datetime (tz-aware)
        (
            [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns")
            ),
        ),
        (
            [
                datetime.datetime(2000, 1, 1, tzinfo=cet),
                datetime.datetime(2001, 1, 1, tzinfo=cet),
            ],
            DatetimeArray._from_sequence(
                ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns")
            ),
        ),
        # timedelta
        (
            [pd.Timedelta("1h"), pd.Timedelta("2h")],
            TimedeltaArray._from_sequence(["1h", "2h"], dtype="m8[ns]"),
        ),
        (
            np.array([1, 2], dtype="m8[ns]"),
            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[ns]")),
        ),
        (
            np.array([1, 2], dtype="m8[us]"),
            TimedeltaArray._from_sequence(np.array([1, 2], dtype="m8[us]")),
        ),
        # integer, including None / pd.NA / np.nan as missing values
        ([1, 2], IntegerArray._from_sequence([1, 2], dtype="Int64")),
        ([1, None], IntegerArray._from_sequence([1, None], dtype="Int64")),
        ([1, pd.NA], IntegerArray._from_sequence([1, pd.NA], dtype="Int64")),
        ([1, np.nan], IntegerArray._from_sequence([1, np.nan], dtype="Int64")),
        # float, including None / np.nan / pd.NA as missing values
        ([0.1, 0.2], FloatingArray._from_sequence([0.1, 0.2], dtype="Float64")),
        ([0.1, None], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        ([0.1, np.nan], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        ([0.1, pd.NA], FloatingArray._from_sequence([0.1, pd.NA], dtype="Float64")),
        # floats holding whole numbers are still expected as Float64
        ([1.0, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
        ([1.0, None], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        ([1.0, np.nan], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        ([1.0, pd.NA], FloatingArray._from_sequence([1.0, pd.NA], dtype="Float64")),
        # mix of Python ints and floats
        ([1, 2.0], FloatingArray._from_sequence([1.0, 2.0], dtype="Float64")),
        (
            [1, np.nan, 2.0],
            FloatingArray._from_sequence([1.0, None, 2.0], dtype="Float64"),
        ),
        # string
        (
            ["a", "b"],
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", "b"], dtype=pd.StringDtype()),
        ),
        (
            ["a", None],
            pd.StringDtype()
            .construct_array_type()
            ._from_sequence(["a", None], dtype=pd.StringDtype()),
        ),
        # boolean
        ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")),
        ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")),
    ],
)
def test_array_inference(data, expected):
    """
    Check the array that ``pd.array`` produces when no dtype is passed.

    Parameters
    ----------
    data : object
        Input passed to ``pd.array`` without a ``dtype`` argument.
    expected : object
        Array the result is compared to with ``tm.assert_equal``.
    """
    result = pd.array(data)
    tm.assert_equal(result, expected)
|
|
|
|
|
@pytest.mark.parametrize(
    "data",
    [
        # periods with different frequencies
        [pd.Period("2000", "D"), pd.Period("2001", "Y")],
        # intervals closed on different sides
        [pd.Interval(0, 1, closed="left"), pd.Interval(1, 2, closed="right")],
        # timestamps in different timezones
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000", tz="UTC")],
        # tz-aware mixed with tz-naive timestamps
        [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2000")],
        np.array([pd.Timestamp("2000"), pd.Timestamp("2000", tz="CET")]),
    ],
)
def test_array_inference_fails(data):
    """
    Check that mixed inputs are returned as an object NumpyExtensionArray.

    Parameters
    ----------
    data : list or numpy.ndarray
        Heterogeneous values passed to ``pd.array`` without a dtype.
    """
    object_backed = NumpyExtensionArray(np.array(data, dtype=object))
    tm.assert_extension_array_equal(pd.array(data), object_backed)
|
|
|
|
|
@pytest.mark.parametrize("data", [np.array(0)])
def test_nd_raises(data):
    """Check that a zero-dimensional ndarray with an int64 dtype is rejected."""
    expected_msg = "NumpyExtensionArray must be 1-dimensional"
    with pytest.raises(ValueError, match=expected_msg):
        pd.array(data, dtype="int64")
|
|
|
|
|
def test_scalar_raises():
    """Check that passing a bare scalar to ``pd.array`` raises ValueError."""
    expected_msg = "Cannot pass scalar '1'"
    with pytest.raises(ValueError, match=expected_msg):
        pd.array(1)
|
|
|
|
|
def test_dataframe_raises():
    """Check that passing a DataFrame to ``pd.array`` raises TypeError."""
    frame = pd.DataFrame([[1, 2], [3, 4]], columns=["A", "B"])
    with pytest.raises(TypeError, match="Cannot pass DataFrame to 'pandas.array'"):
        pd.array(frame)
|
|
|
|
|
def test_bounds_check():
    """Check that a negative value with the ``UInt16`` dtype raises TypeError."""
    # The reported source dtype may be int32 or int64, so the pattern
    # accepts either.
    pattern = r"cannot safely cast non-equivalent int(32|64) to uint16"
    with pytest.raises(TypeError, match=pattern):
        pd.array([-1, 2, 3], dtype="UInt16")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@register_extension_dtype
class DecimalDtype2(DecimalDtype):
    """Registered ``DecimalDtype`` subclass that is paired with ``DecimalArray2``."""

    # String alias of the dtype; used as ``dtype="decimal2"`` in this module.
    name = "decimal2"

    @classmethod
    def construct_array_type(cls):
        """
        Return the extension array class that goes with this dtype.

        Returns
        -------
        type
            The ``DecimalArray2`` class.
        """
        return DecimalArray2
|
|
|
|
|
class DecimalArray2(DecimalArray):
    """``DecimalArray`` subclass whose constructor rejects Series and Index."""

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy=False):
        """
        Build the array from ``scalars`` unless it is a Series or an Index.

        Raises
        ------
        TypeError
            If ``scalars`` is a ``pd.Series`` or ``pd.Index``.
        """
        is_boxed = isinstance(scalars, (pd.Series, pd.Index))
        if not is_boxed:
            return super()._from_sequence(scalars, dtype=dtype, copy=copy)

        raise TypeError("scalars should not be of type pd.Series or pd.Index")
|
|
|
|
|
def test_array_unboxes(index_or_series):
    """
    Check that ``pd.array`` succeeds where ``DecimalArray2`` rejects a container.

    Parameters
    ----------
    index_or_series : callable
        Fixture value used to wrap the Decimal values in a container.
    """
    boxed = index_or_series([decimal.Decimal("1"), decimal.Decimal("2")])
    decimal2 = DecimalDtype2()

    # Handing the container straight to the array class must fail ...
    reject_msg = "scalars should not be of type pd.Series or pd.Index"
    with pytest.raises(TypeError, match=reject_msg):
        DecimalArray2._from_sequence(boxed, dtype=decimal2)

    # ... while going through ``pd.array`` with the same container succeeds.
    via_pd_array = pd.array(boxed, dtype="decimal2")
    from_values = DecimalArray2._from_sequence(boxed.values, dtype=decimal2)
    tm.assert_equal(via_pd_array, from_values)
|
|
|
|
|
def test_array_to_numpy_na():
    """Check ``to_numpy(dtype=bool, na_value=True)`` on a python string array."""
    strings = pd.array([pd.NA, 1], dtype="string[python]")
    as_bool = strings.to_numpy(dtype=bool, na_value=True)
    tm.assert_numpy_array_equal(as_bool, np.array([True, True]))
|
|