Sam Chaudry

Upload folder using huggingface_hub

7885a28 verified about 1 month ago

13.3 kB

	from textwrap import dedent

	import numpy as np
	import pytest

	from pandas.errors import (
	PyperclipException,
	PyperclipWindowsException,
	)

	import pandas as pd
	from pandas import (
	NA,
	DataFrame,
	Series,
	get_option,
	read_clipboard,
	)
	import pandas._testing as tm
	from pandas.core.arrays import (
	ArrowStringArray,
	StringArray,
	)

	from pandas.io.clipboard import (
	CheckedCall,
	_stringifyText,
	init_qt_clipboard,
	)


	def build_kwargs(sep, excel):
	    """
	    Collect the keyword arguments that were explicitly requested.

	    The string ``"default"`` acts as a sentinel meaning "omit this argument";
	    any other value (``None`` included) is forwarded under its own name.
	    The resulting dict is unpacked into ``to_clipboard`` by the tests.
	    """
	    # "excel" is listed first so key insertion order is excel, then sep.
	    candidates = (("excel", excel), ("sep", sep))
	    return {name: value for name, value in candidates if value != "default"}


	@pytest.fixture(
	    params=[
	        "delims",
	        "utf8",
	        "utf16",
	        "string",
	        "long",
	        "nonascii",
	        "colwidth",
	        "mixed",
	        "float",
	        "int",
	    ]
	)
	def df(request):
	    """
	    Return the DataFrame variant selected by ``request.param``.

	    Each parameter name picks one kind of content (delimiter characters,
	    non-ASCII text, more rows than ``display.max_rows``, cells wider than
	    ``display.max_colwidth``, mixed dtypes, ...).

	    Raises
	    ------
	    ValueError
	        If ``request.param`` is not one of the declared names.
	    """
	    data_type = request.param

	    if data_type == "delims":
	        # Cells contain quotes, a comma, tabs and a pipe. The pipe is written
	        # as a plain "|": a backslash in front of it is an invalid escape
	        # sequence and would add a stray backslash to the data.
	        return DataFrame({"a": ['"a,\t"b|c', "d\tef`"], "b": ["hi'j", "k''lm"]})
	    if data_type == "utf8":
	        return DataFrame({"a": ["µasd", "Ωœ∑`"], "b": ["øπ∆˚¬", "œ∑`®"]})
	    if data_type == "utf16":
	        return DataFrame(
	            {"a": ["\U0001f44d\U0001f44d", "\U0001f44d\U0001f44d"], "b": ["abc", "def"]}
	        )
	    if data_type == "string":
	        return DataFrame(
	            np.array([f"i-{i}" for i in range(15)]).reshape(5, 3), columns=list("abc")
	        )
	    if data_type == "long":
	        # One row more than the display limit.
	        max_rows = get_option("display.max_rows")
	        return DataFrame(
	            np.random.default_rng(2).integers(0, 10, size=(max_rows + 1, 3)),
	            columns=list("abc"),
	        )
	    if data_type == "nonascii":
	        return DataFrame({"en": "in English".split(), "es": "en español".split()})
	    if data_type == "colwidth":
	        # Every cell is one character wider than the display column width.
	        _cw = get_option("display.max_colwidth") + 1
	        return DataFrame(
	            np.array(["x" * _cw for _ in range(15)]).reshape(5, 3), columns=list("abc")
	        )
	    if data_type == "mixed":
	        return DataFrame(
	            {
	                "a": np.arange(1.0, 6.0) + 0.01,
	                "b": np.arange(1, 6).astype(np.int64),
	                "c": list("abcde"),
	            }
	        )
	    if data_type == "float":
	        return DataFrame(np.random.default_rng(2).random((5, 3)), columns=list("abc"))
	    if data_type == "int":
	        return DataFrame(
	            np.random.default_rng(2).integers(0, 10, (5, 3)), columns=list("abc")
	        )
	    raise ValueError(f"unknown df fixture parameter: {data_type!r}")


	@pytest.fixture
	def mock_ctypes(monkeypatch):
	    """
	    Replace ``ctypes.WinError`` with a stub that returns ``"Window Error"``.

	    The patch is active for as long as the fixture is in use.
	    """
	    # raising=False: WinError does not exist on non-Windows platforms, so
	    # setattr must be allowed to create the attribute.
	    with monkeypatch.context() as patcher:
	        patcher.setattr("ctypes.WinError", lambda: "Window Error", raising=False)
	        yield


	@pytest.mark.usefixtures("mock_ctypes")
	def test_checked_call_with_bad_call(monkeypatch):
	    """
	    Wrap a function that returns a falsy value in CheckedCall while
	    get_errno is patched to return True, and expect a
	    PyperclipWindowsException whose message carries the stubbed
	    WinError text.
	    """
	    monkeypatch.setattr("pandas.io.clipboard.get_errno", lambda: True)

	    def _return_false():
	        return False

	    # Parentheses are escaped because the message is matched as a regex.
	    expected_msg = f"Error calling {_return_false.__name__} \\(Window Error\\)"

	    with pytest.raises(PyperclipWindowsException, match=expected_msg):
	        CheckedCall(_return_false)()


	@pytest.mark.usefixtures("mock_ctypes")
	def test_checked_call_with_valid_call(monkeypatch):
	    """
	    Wrap a function that returns a truthy value in CheckedCall while
	    get_errno is patched to return False; no exception is expected and
	    the wrapper must hand back the wrapped function's result.
	    """
	    monkeypatch.setattr("pandas.io.clipboard.get_errno", lambda: False)

	    def _return_true():
	        return True

	    # The wrapped callable returns a truthy value.
	    outcome = CheckedCall(_return_true)()
	    assert outcome is True


	@pytest.mark.parametrize(
	    "text",
	    [
	        "String_test",
	        True,
	        1,
	        1.0,
	        1j,
	    ],
	)
	def test_stringify_text(text):
	    """
	    str, int, float and bool inputs must come back as ``str(text)``;
	    any other type (here: complex) must raise PyperclipException.
	    """
	    if not isinstance(text, (str, int, float, bool)):
	        msg = (
	            "only str, int, float, and bool values "
	            f"can be copied to the clipboard, not {type(text).__name__}"
	        )
	        with pytest.raises(PyperclipException, match=msg):
	            _stringifyText(text)
	        return

	    converted = _stringifyText(text)
	    assert converted == str(text)


	@pytest.fixture
	def set_pyqt_clipboard(monkeypatch):
	    """
	    Point ``pd.io.clipboard.clipboard_set`` / ``clipboard_get`` at the pair
	    of functions returned by ``init_qt_clipboard`` while the fixture is
	    in use.
	    """
	    qt_cut, qt_paste = init_qt_clipboard()
	    replacements = {"clipboard_set": qt_cut, "clipboard_get": qt_paste}
	    with monkeypatch.context() as patcher:
	        for attr_name, qt_func in replacements.items():
	            patcher.setattr(pd.io.clipboard, attr_name, qt_func)
	        yield


	@pytest.fixture
	def clipboard(qapp):
	    """
	    Yield the clipboard object of the ``qapp`` fixture and clear it once
	    the test has finished with it.
	    """
	    qt_clipboard = qapp.clipboard()
	    yield qt_clipboard
	    qt_clipboard.clear()


	@pytest.mark.single_cpu
	@pytest.mark.clipboard
	@pytest.mark.usefixtures("set_pyqt_clipboard")
	@pytest.mark.usefixtures("clipboard")
	class TestClipboard:
	    """
	    Tests for ``DataFrame.to_clipboard`` and ``read_clipboard``.

	    Every test runs with the ``set_pyqt_clipboard`` and ``clipboard``
	    fixtures applied through the class-level ``usefixtures`` marks.
	    """

	    # Test that default arguments copy as tab delimited
	    # Test that explicit delimiters are respected
	    # The pipe separator is a plain "|"; writing it with a leading backslash
	    # is an invalid escape sequence and produces a two-character separator.
	    @pytest.mark.parametrize("sep", [None, "\t", ",", "|"])
	    @pytest.mark.parametrize("encoding", [None, "UTF-8", "utf-8", "utf8"])
	    def test_round_trip_frame_sep(self, df, sep, encoding):
	        df.to_clipboard(excel=None, sep=sep, encoding=encoding)
	        # sep=None is read back with a tab separator.
	        result = read_clipboard(sep=sep or "\t", index_col=0, encoding=encoding)
	        tm.assert_frame_equal(df, result)

	    # Test white space separator
	    def test_round_trip_frame_string(self, df):
	        df.to_clipboard(excel=False, sep=None)
	        result = read_clipboard()
	        assert df.to_string() == result.to_string()
	        assert df.shape == result.shape

	    # Two character separator is not supported in to_clipboard
	    # Test that multi-character separators are not silently passed
	    def test_excel_sep_warning(self, df):
	        with tm.assert_produces_warning(
	            UserWarning,
	            match="to_clipboard in excel mode requires a single character separator.",
	            check_stacklevel=False,
	        ):
	            # Raw string: backslash followed by "t", i.e. two characters.
	            df.to_clipboard(excel=True, sep=r"\t")

	    # Separator is ignored when excel=False and should produce a warning
	    def test_copy_delim_warning(self, df):
	        with tm.assert_produces_warning():
	            df.to_clipboard(excel=False, sep="\t")

	    # Tests that the default behavior of to_clipboard is tab
	    # delimited and excel="True"
	    @pytest.mark.parametrize("sep", ["\t", None, "default"])
	    @pytest.mark.parametrize("excel", [True, None, "default"])
	    def test_clipboard_copy_tabs_default(self, sep, excel, df, clipboard):
	        # "default" means the keyword is not passed at all.
	        kwargs = build_kwargs(sep, excel)
	        df.to_clipboard(**kwargs)
	        assert clipboard.text() == df.to_csv(sep="\t")

	    # Tests reading of white space separated tables
	    @pytest.mark.parametrize("sep", [None, "default"])
	    def test_clipboard_copy_strings(self, sep, df):
	        kwargs = build_kwargs(sep, False)
	        df.to_clipboard(**kwargs)
	        result = read_clipboard(sep=r"\s+")
	        assert result.to_string() == df.to_string()
	        assert df.shape == result.shape

	    def test_read_clipboard_infer_excel(self, clipboard):
	        # gh-19010: avoid warnings
	        clip_kwargs = {"engine": "python"}

	        # dedent runs before strip: stripping first removes the margin of the
	        # first line only, which leaves dedent with no common margin and the
	        # remaining lines keep their source indentation.
	        text = dedent(
	            """
	            John James\tCharlie Mingus
	            1\t2
	            4\tHarry Carney
	            """
	        ).strip()
	        clipboard.setText(text)
	        df = read_clipboard(**clip_kwargs)

	        # excel data is parsed correctly
	        assert df.iloc[1, 1] == "Harry Carney"

	        # having diff tab counts doesn't trigger it
	        text = dedent(
	            """
	            a\t b
	            1 2
	            3 4
	            """
	        ).strip()
	        clipboard.setText(text)
	        res = read_clipboard(**clip_kwargs)

	        text = dedent(
	            """
	            a b
	            1 2
	            3 4
	            """
	        ).strip()
	        clipboard.setText(text)
	        exp = read_clipboard(**clip_kwargs)

	        tm.assert_frame_equal(res, exp)

	    def test_infer_excel_with_nulls(self, clipboard):
	        # GH41108
	        text = "col1\tcol2\n1\tred\n\tblue\n2\tgreen"

	        clipboard.setText(text)
	        df = read_clipboard()
	        df_expected = DataFrame(
	            data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]}
	        )

	        # excel data is parsed correctly
	        tm.assert_frame_equal(df, df_expected)

	    @pytest.mark.parametrize(
	        "multiindex",
	        [
	            (  # Can't use `dedent` here as it will remove the leading `\t`
	                "\n".join(
	                    [
	                        "\t\t\tcol1\tcol2",
	                        "A\t0\tTrue\t1\tred",
	                        "A\t1\tTrue\t\tblue",
	                        "B\t0\tFalse\t2\tgreen",
	                    ]
	                ),
	                [["A", "A", "B"], [0, 1, 0], [True, True, False]],
	            ),
	            (
	                "\n".join(
	                    ["\t\tcol1\tcol2", "A\t0\t1\tred", "A\t1\t\tblue", "B\t0\t2\tgreen"]
	                ),
	                [["A", "A", "B"], [0, 1, 0]],
	            ),
	        ],
	    )
	    def test_infer_excel_with_multiindex(self, clipboard, multiindex):
	        # GH41108
	        # multiindex[0] is the clipboard text, multiindex[1] the expected index.

	        clipboard.setText(multiindex[0])
	        df = read_clipboard()
	        df_expected = DataFrame(
	            data={"col1": [1, None, 2], "col2": ["red", "blue", "green"]},
	            index=multiindex[1],
	        )

	        # excel data is parsed correctly
	        tm.assert_frame_equal(df, df_expected)

	    def test_invalid_encoding(self, df):
	        msg = "clipboard only supports utf-8 encoding"
	        # test case for testing invalid encoding
	        with pytest.raises(ValueError, match=msg):
	            df.to_clipboard(encoding="ascii")
	        with pytest.raises(NotImplementedError, match=msg):
	            read_clipboard(encoding="ascii")

	    @pytest.mark.parametrize("data", ["\U0001f44d...", "Ωœ∑`...", "abcd..."])
	    def test_raw_roundtrip(self, data):
	        # PR #25040 wide unicode wasn't copied correctly on PY3 on windows
	        df = DataFrame({"data": [data]})
	        df.to_clipboard()
	        result = read_clipboard()
	        tm.assert_frame_equal(df, result)

	    @pytest.mark.parametrize("engine", ["c", "python"])
	    def test_read_clipboard_dtype_backend(
	        self, clipboard, string_storage, dtype_backend, engine
	    ):
	        # GH#50502
	        if string_storage == "pyarrow" or dtype_backend == "pyarrow":
	            pa = pytest.importorskip("pyarrow")

	        if string_storage == "python":
	            string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
	            string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))

	        elif dtype_backend == "pyarrow" and engine != "c":
	            # pa is already bound here: dtype_backend == "pyarrow" triggered
	            # the importorskip above.
	            from pandas.arrays import ArrowExtensionArray

	            string_array = ArrowExtensionArray(pa.array(["x", "y"]))
	            string_array_na = ArrowExtensionArray(pa.array(["x", None]))

	        else:
	            # NOTE(review): pa is only bound if one of the two settings is
	            # "pyarrow" -- confirm string_storage has no other values.
	            string_array = ArrowStringArray(pa.array(["x", "y"]))
	            string_array_na = ArrowStringArray(pa.array(["x", None]))

	        # Built from explicit pieces so the clipboard content does not depend
	        # on how this source file is indented.
	        text = (
	            "a,b,c,d,e,f,g,h,i\n"
	            "x,1,4.0,x,2,4.0,,True,False\n"
	            "y,2,5.0,,,,,False,"
	        )
	        clipboard.setText(text)

	        with pd.option_context("mode.string_storage", string_storage):
	            result = read_clipboard(sep=",", dtype_backend=dtype_backend, engine=engine)

	        expected = DataFrame(
	            {
	                "a": string_array,
	                "b": Series([1, 2], dtype="Int64"),
	                "c": Series([4.0, 5.0], dtype="Float64"),
	                "d": string_array_na,
	                "e": Series([2, NA], dtype="Int64"),
	                "f": Series([4.0, NA], dtype="Float64"),
	                "g": Series([NA, NA], dtype="Int64"),
	                "h": Series([True, False], dtype="boolean"),
	                "i": Series([False, NA], dtype="boolean"),
	            }
	        )
	        if dtype_backend == "pyarrow":
	            from pandas.arrays import ArrowExtensionArray

	            # Re-wrap every expected column as an Arrow-backed array.
	            expected = DataFrame(
	                {
	                    col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
	                    for col in expected.columns
	                }
	            )
	            # The all-missing column is rebuilt from a list of None values.
	            expected["g"] = ArrowExtensionArray(pa.array([None, None]))

	        tm.assert_frame_equal(result, expected)

	    def test_invalid_dtype_backend(self):
	        msg = (
	            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
	            "'pyarrow' are allowed."
	        )
	        with pytest.raises(ValueError, match=msg):
	            read_clipboard(dtype_backend="numpy")

	    def test_to_clipboard_pos_args_deprecation(self):
	        # GH-54229
	        df = DataFrame({"a": [1, 2, 3]})
	        msg = (
	            r"Starting with pandas version 3.0 all arguments of to_clipboard "
	            r"will be keyword-only."
	        )
	        with tm.assert_produces_warning(FutureWarning, match=msg):
	            # Arguments are passed positionally on purpose.
	            df.to_clipboard(True, None)