File size: 6,353 Bytes
7885a28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
import numpy as np
import pytest
from pandas import (
CategoricalDtype,
DataFrame,
Index,
MultiIndex,
Series,
_testing as tm,
option_context,
)
from pandas.core.strings.accessor import StringMethods
# Subset of the full set from pandas/conftest.py: each entry pairs the
# inferred dtype label with sample values that infer to it.
_any_allowed_skipna_inferred_dtype = [
    ("string", ["a", np.nan, "c"]),
    ("bytes", [b"a", np.nan, b"c"]),
    ("empty", [np.nan, np.nan, np.nan]),
    ("empty", []),
    ("mixed-integer", ["a", np.nan, 2]),
]
# use inferred type as id
ids = tuple(label for label, _ in _any_allowed_skipna_inferred_dtype)
@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
def any_allowed_skipna_inferred_dtype(request):
    """
    Fixture for the (inferred) dtypes allowed in StringMethods.__init__

    The parametrization covers these (inferred) types:

    * 'string'
    * 'bytes'
    * 'empty' (all-NaN values and zero-length values)
    * 'mixed-integer'

    Returns
    -------
    inferred_dtype : str
        The string for the inferred dtype from _libs.lib.infer_dtype
    values : np.ndarray
        An array of object dtype that will be inferred to have
        `inferred_dtype`

    Examples
    --------
    >>> from pandas._libs import lib
    >>>
    >>> def test_something(any_allowed_skipna_inferred_dtype):
    ...     inferred_dtype, values = any_allowed_skipna_inferred_dtype
    ...     # will pass
    ...     assert lib.infer_dtype(values, skipna=True) == inferred_dtype
    ...
    ...     # constructor for .str-accessor will also pass
    ...     Series(values).str
    """
    label, raw_values = request.param
    # correctness of inference tested in tests/dtypes/test_inference.py
    # object dtype to avoid casting
    return label, np.array(raw_values, dtype=object)
def test_api(any_string_dtype):
# GH 6106, GH 9322
assert Series.str is StringMethods
assert isinstance(Series([""], dtype=any_string_dtype).str, StringMethods)
def test_api_mi_raises():
    """
    Check that accessing ``.str`` on a MultiIndex raises AttributeError
    and that ``hasattr`` consequently reports the attribute as missing.
    """
    # GH 23679
    expected_msg = "Can only use .str accessor with Index, not MultiIndex"
    multi = MultiIndex.from_arrays([list("abc")])

    with pytest.raises(AttributeError, match=expected_msg):
        multi.str

    # hasattr swallows the AttributeError, so it must report False
    assert not hasattr(multi, "str")
@pytest.mark.parametrize("dtype", [object, "category"])
def test_api_per_dtype(index_or_series, dtype, any_skipna_inferred_dtype):
    """
    Check that the ``.str`` accessor can be constructed exactly for the
    inferred dtypes listed below and raises AttributeError for all others.
    """
    # one instance of parametrized fixture
    inferred_dtype, values = any_skipna_inferred_dtype
    # explicit dtype to avoid casting
    obj = index_or_series(values, dtype=dtype)

    accessor_allowed = inferred_dtype in (
        "string",
        "unicode",
        "empty",
        "bytes",
        "mixed",
        "mixed-integer",
    )

    if not accessor_allowed:
        # GH 9184, GH 23011, GH 23163
        with pytest.raises(
            AttributeError,
            match="Can only use .str accessor with string values.*",
        ):
            obj.str
        assert not hasattr(obj, "str")
        return

    # GH 6106
    assert isinstance(obj.str, StringMethods)
@pytest.mark.parametrize("dtype", [object, "category"])
def test_api_per_method(
    index_or_series,
    dtype,
    any_allowed_skipna_inferred_dtype,
    any_string_method,
    request,
):
    """
    Check that each string method runs on the inferred dtypes it allows
    and raises TypeError on the others.

    This test does not check correctness of the different methods,
    just that the methods work on the specified (inferred) dtypes,
    and raise on all others.
    """
    box = index_or_series

    # one instance of each parametrized fixture
    inferred_dtype, values = any_allowed_skipna_inferred_dtype
    method_name, args, kwargs = any_string_method

    # Known failures, recorded as (expected exception, reason); None means
    # the combination is expected to behave normally.
    known_failure = None
    if box is Index:
        if values.size == 0:
            if method_name in ("partition", "rpartition") and kwargs.get(
                "expand", True
            ):
                known_failure = (TypeError, "Method cannot deal with empty Index")
            elif method_name == "split" and kwargs.get("expand", None):
                known_failure = (
                    TypeError,
                    "Split fails on empty Series when expand=True",
                )
            elif method_name == "get_dummies":
                known_failure = (
                    ValueError,
                    "Need to fortify get_dummies corner cases",
                )
        elif (
            inferred_dtype == "empty"
            and dtype == object
            and method_name == "get_dummies"
        ):
            known_failure = (ValueError, "Need to fortify get_dummies corner cases")

    if known_failure is not None:
        exc_type, why = known_failure
        mark = pytest.mark.xfail(raises=exc_type, reason=why)
        request.applymarker(mark)

    # explicit dtype to avoid casting
    t = box(values, dtype=dtype)
    method = getattr(t.str, method_name)

    allowed_types = ["string", "unicode", "empty"]
    if method_name in ("decode", "get", "len", "slice"):
        allowed_types.append("bytes")
    # as of v0.23.4, all methods except 'cat' are very lenient with the
    # allowed data types, just returning NaN for entries that error.
    # This could be changed with an 'errors'-kwarg to the `str`-accessor,
    # see discussion in GH 13877
    if method_name != "cat":
        allowed_types.extend(["mixed", "mixed-integer"])

    if inferred_dtype not in allowed_types:
        # GH 23011, GH 23163
        msg = (
            f"Cannot use .str.{method_name} with values of "
            f"inferred dtype {repr(inferred_dtype)}."
        )
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
        return

    # xref GH 23555, GH 23556
    with option_context("future.no_silent_downcasting", True):
        method(*args, **kwargs)  # works!
def test_api_for_categorical(any_string_method, any_string_dtype):
# https://github.com/pandas-dev/pandas/issues/10661
s = Series(list("aabb"), dtype=any_string_dtype)
s = s + " " + s
c = s.astype("category")
c = c.astype(CategoricalDtype(c.dtype.categories.astype("object")))
assert isinstance(c.str, StringMethods)
method_name, args, kwargs = any_string_method
result = getattr(c.str, method_name)(*args, **kwargs)
expected = getattr(s.astype("object").str, method_name)(*args, **kwargs)
if isinstance(result, DataFrame):
tm.assert_frame_equal(result, expected)
elif isinstance(result, Series):
tm.assert_series_equal(result, expected)
else:
# str.cat(others=None) returns string, for example
assert result == expected
|