Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified about 1 month ago

6.35 kB

	import numpy as np
	import pytest

	from pandas import (
	CategoricalDtype,
	DataFrame,
	Index,
	MultiIndex,
	Series,
	_testing as tm,
	option_context,
	)
	from pandas.core.strings.accessor import StringMethods

	# subset of the full set from pandas/conftest.py
	_any_allowed_skipna_inferred_dtype = [
	("string", ["a", np.nan, "c"]),
	("bytes", [b"a", np.nan, b"c"]),
	("empty", [np.nan, np.nan, np.nan]),
	("empty", []),
	("mixed-integer", ["a", np.nan, 2]),
	]
	ids, _ = zip(*_any_allowed_skipna_inferred_dtype) # use inferred type as id


	@pytest.fixture(params=_any_allowed_skipna_inferred_dtype, ids=ids)
	def any_allowed_skipna_inferred_dtype(request):
	"""
	Fixture for all (inferred) dtypes allowed in StringMethods.__init__

	The covered (inferred) types are:
	* 'string'
	* 'empty'
	* 'bytes'
	* 'mixed'
	* 'mixed-integer'

	Returns
	-------
	inferred_dtype : str
	The string for the inferred dtype from _libs.lib.infer_dtype
	values : np.ndarray
	An array of object dtype that will be inferred to have
	`inferred_dtype`

	Examples
	--------
	>>> from pandas._libs import lib
	>>>
	>>> def test_something(any_allowed_skipna_inferred_dtype):
	... inferred_dtype, values = any_allowed_skipna_inferred_dtype
	... # will pass
	... assert lib.infer_dtype(values, skipna=True) == inferred_dtype
	...
	... # constructor for .str-accessor will also pass
	... Series(values).str
	"""
	inferred_dtype, values = request.param
	values = np.array(values, dtype=object) # object dtype to avoid casting

	# correctness of inference tested in tests/dtypes/test_inference.py
	return inferred_dtype, values


	def test_api(any_string_dtype):
	# GH 6106, GH 9322
	assert Series.str is StringMethods
	assert isinstance(Series([""], dtype=any_string_dtype).str, StringMethods)


	def test_api_mi_raises():
	# GH 23679
	mi = MultiIndex.from_arrays([["a", "b", "c"]])
	msg = "Can only use .str accessor with Index, not MultiIndex"
	with pytest.raises(AttributeError, match=msg):
	mi.str
	assert not hasattr(mi, "str")


	@pytest.mark.parametrize("dtype", [object, "category"])
	def test_api_per_dtype(index_or_series, dtype, any_skipna_inferred_dtype):
	# one instance of parametrized fixture
	box = index_or_series
	inferred_dtype, values = any_skipna_inferred_dtype

	t = box(values, dtype=dtype) # explicit dtype to avoid casting

	types_passing_constructor = [
	"string",
	"unicode",
	"empty",
	"bytes",
	"mixed",
	"mixed-integer",
	]
	if inferred_dtype in types_passing_constructor:
	# GH 6106
	assert isinstance(t.str, StringMethods)
	else:
	# GH 9184, GH 23011, GH 23163
	msg = "Can only use .str accessor with string values.*"
	with pytest.raises(AttributeError, match=msg):
	t.str
	assert not hasattr(t, "str")


	@pytest.mark.parametrize("dtype", [object, "category"])
	def test_api_per_method(
	index_or_series,
	dtype,
	any_allowed_skipna_inferred_dtype,
	any_string_method,
	request,
	):
	# this test does not check correctness of the different methods,
	# just that the methods work on the specified (inferred) dtypes,
	# and raise on all others
	box = index_or_series

	# one instance of each parametrized fixture
	inferred_dtype, values = any_allowed_skipna_inferred_dtype
	method_name, args, kwargs = any_string_method

	reason = None
	if box is Index and values.size == 0:
	if method_name in ["partition", "rpartition"] and kwargs.get("expand", True):
	raises = TypeError
	reason = "Method cannot deal with empty Index"
	elif method_name == "split" and kwargs.get("expand", None):
	raises = TypeError
	reason = "Split fails on empty Series when expand=True"
	elif method_name == "get_dummies":
	raises = ValueError
	reason = "Need to fortify get_dummies corner cases"

	elif (
	box is Index
	and inferred_dtype == "empty"
	and dtype == object
	and method_name == "get_dummies"
	):
	raises = ValueError
	reason = "Need to fortify get_dummies corner cases"

	if reason is not None:
	mark = pytest.mark.xfail(raises=raises, reason=reason)
	request.applymarker(mark)

	t = box(values, dtype=dtype) # explicit dtype to avoid casting
	method = getattr(t.str, method_name)

	bytes_allowed = method_name in ["decode", "get", "len", "slice"]
	# as of v0.23.4, all methods except 'cat' are very lenient with the
	# allowed data types, just returning NaN for entries that error.
	# This could be changed with an 'errors'-kwarg to the `str`-accessor,
	# see discussion in GH 13877
	mixed_allowed = method_name not in ["cat"]

	allowed_types = (
	["string", "unicode", "empty"]
	+ ["bytes"] * bytes_allowed
	+ ["mixed", "mixed-integer"] * mixed_allowed
	)

	if inferred_dtype in allowed_types:
	# xref GH 23555, GH 23556
	with option_context("future.no_silent_downcasting", True):
	method(args, *kwargs) # works!
	else:
	# GH 23011, GH 23163
	msg = (
	f"Cannot use .str.{method_name} with values of "
	f"inferred dtype {repr(inferred_dtype)}."
	)
	with pytest.raises(TypeError, match=msg):
	method(args, *kwargs)


	def test_api_for_categorical(any_string_method, any_string_dtype):
	# https://github.com/pandas-dev/pandas/issues/10661
	s = Series(list("aabb"), dtype=any_string_dtype)
	s = s + " " + s
	c = s.astype("category")
	c = c.astype(CategoricalDtype(c.dtype.categories.astype("object")))
	assert isinstance(c.str, StringMethods)

	method_name, args, kwargs = any_string_method

	result = getattr(c.str, method_name)(args, *kwargs)
	expected = getattr(s.astype("object").str, method_name)(args, *kwargs)

	if isinstance(result, DataFrame):
	tm.assert_frame_equal(result, expected)
	elif isinstance(result, Series):
	tm.assert_series_equal(result, expected)
	else:
	# str.cat(others=None) returns string, for example
	assert result == expected