|
""" orc compat """ |
|
from __future__ import annotations |
|
|
|
import io |
|
from types import ModuleType |
|
from typing import ( |
|
TYPE_CHECKING, |
|
Any, |
|
Literal, |
|
) |
|
|
|
from pandas._config import using_pyarrow_string_dtype |
|
|
|
from pandas._libs import lib |
|
from pandas.compat._optional import import_optional_dependency |
|
from pandas.util._validators import check_dtype_backend |
|
|
|
import pandas as pd |
|
from pandas.core.indexes.api import default_index |
|
|
|
from pandas.io._util import arrow_string_types_mapper |
|
from pandas.io.common import ( |
|
get_handle, |
|
is_fsspec_url, |
|
) |
|
|
|
if TYPE_CHECKING: |
|
import fsspec |
|
import pyarrow.fs |
|
|
|
from pandas._typing import ( |
|
DtypeBackend, |
|
FilePath, |
|
ReadBuffer, |
|
WriteBuffer, |
|
) |
|
|
|
from pandas.core.frame import DataFrame |
|
|
|
|
|
def read_orc( |
|
path: FilePath | ReadBuffer[bytes], |
|
columns: list[str] | None = None, |
|
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, |
|
filesystem: pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None = None, |
|
**kwargs: Any, |
|
) -> DataFrame: |
|
""" |
|
Load an ORC object from the file path, returning a DataFrame. |
|
|
|
Parameters |
|
---------- |
|
path : str, path object, or file-like object |
|
String, path object (implementing ``os.PathLike[str]``), or file-like |
|
object implementing a binary ``read()`` function. The string could be a URL. |
|
Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is |
|
expected. A local file could be: |
|
``file://localhost/path/to/table.orc``. |
|
columns : list, default None |
|
If not None, only these columns will be read from the file. |
|
Output always follows the ordering of the file and not the columns list. |
|
This mirrors the original behaviour of |
|
:external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`. |
|
dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' |
|
Back-end data type applied to the resultant :class:`DataFrame` |
|
(still experimental). Behaviour is as follows: |
|
|
|
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` |
|
(default). |
|
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` |
|
DataFrame. |
|
|
|
.. versionadded:: 2.0 |
|
|
|
filesystem : fsspec or pyarrow filesystem, default None |
|
Filesystem object to use when reading the parquet file. |
|
|
|
.. versionadded:: 2.1.0 |
|
|
|
**kwargs |
|
Any additional kwargs are passed to pyarrow. |
|
|
|
Returns |
|
------- |
|
DataFrame |
|
|
|
Notes |
|
----- |
|
Before using this function you should read the :ref:`user guide about ORC <io.orc>` |
|
and :ref:`install optional dependencies <install.warn_orc>`. |
|
|
|
If ``path`` is a URI scheme pointing to a local or remote file (e.g. "s3://"), |
|
a ``pyarrow.fs`` filesystem will be attempted to read the file. You can also pass a |
|
pyarrow or fsspec filesystem object into the filesystem keyword to override this |
|
behavior. |
|
|
|
Examples |
|
-------- |
|
>>> result = pd.read_orc("example_pa.orc") # doctest: +SKIP |
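
    Select only certain columns and request pyarrow-backed dtypes; the file
    name is illustrative, as above:

    >>> result = pd.read_orc(
    ...     "example_pa.orc",
    ...     columns=["int_col", "str_col"],
    ...     dtype_backend="pyarrow",
    ... )  # doctest: +SKIP

    Pass an fsspec filesystem explicitly instead of letting pandas infer one
    from a URL (the bucket and key here are hypothetical):

    >>> import fsspec  # doctest: +SKIP
    >>> fs = fsspec.filesystem("s3", anon=True)  # doctest: +SKIP
    >>> result = pd.read_orc("bucket/table.orc", filesystem=fs)  # doctest: +SKIP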
|
""" |
|
|
|
|
|
    orc = import_optional_dependency("pyarrow.orc")

    check_dtype_backend(dtype_backend)

    with get_handle(path, "rb", is_text=False) as handles:
        source = handles.handle
        if is_fsspec_url(path) and filesystem is None:
            pa = import_optional_dependency("pyarrow")
            pa_fs = import_optional_dependency("pyarrow.fs")

            try:
                # Infer a pyarrow filesystem from the URI and read directly
                # from it; fall back to the already-open handle if pyarrow
                # does not understand the URI.
                filesystem, source = pa_fs.FileSystem.from_uri(path)
            except (TypeError, pa.ArrowInvalid):
                pass

        pa_table = orc.read_table(
            source=source, columns=columns, filesystem=filesystem, **kwargs
        )
    if dtype_backend is not lib.no_default:
        if dtype_backend == "pyarrow":
            df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
        else:
            from pandas.io._util import _arrow_dtype_mapping

            mapping = _arrow_dtype_mapping()
            df = pa_table.to_pandas(types_mapper=mapping.get)
        return df
    else:
        if using_pyarrow_string_dtype():
            types_mapper = arrow_string_types_mapper()
        else:
            types_mapper = None
        return pa_table.to_pandas(types_mapper=types_mapper)


def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
""" |
|
Write a DataFrame to the ORC format. |
|
|
|
.. versionadded:: 1.5.0 |
|
|
|
Parameters |
|
---------- |
|
df : DataFrame |
|
The dataframe to be written to ORC. Raises NotImplementedError |
|
if dtype of one or more columns is category, unsigned integers, |
|
intervals, periods or sparse. |
|
path : str, file-like object or None, default None |
|
If a string, it will be used as Root Directory path |
|
when writing a partitioned dataset. By file-like object, |
|
we refer to objects with a write() method, such as a file handle |
|
(e.g. via builtin open function). If path is None, |
|
a bytes object is returned. |
|
engine : str, default 'pyarrow' |
|
ORC library to use. |
|
index : bool, optional |
|
If ``True``, include the dataframe's index(es) in the file output. If |
|
``False``, they will not be written to the file. |
|
If ``None``, similar to ``infer`` the dataframe's index(es) |
|
will be saved. However, instead of being saved as values, |
|
the RangeIndex will be stored as a range in the metadata so it |
|
doesn't require much space and is faster. Other indexes will |
|
be included as columns in the file output. |
|
engine_kwargs : dict[str, Any] or None, default None |
|
Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. |
|
|
|
Returns |
|
------- |
|
bytes if no path argument is provided else None |
|
|
|
Raises |
|
------ |
|
NotImplementedError |
|
Dtype of one or more columns is category, unsigned integers, interval, |
|
period or sparse. |
|
ValueError |
|
engine is not pyarrow. |
|
|
|
Notes |
|
----- |
|
* Before using this function you should read the |
|
:ref:`user guide about ORC <io.orc>` and |
|
:ref:`install optional dependencies <install.warn_orc>`. |
|
* This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_ |
|
library. |
|
* For supported dtypes please refer to `supported ORC features in Arrow |
|
<https://arrow.apache.org/docs/cpp/orc.html#data-types>`__. |
|
* Currently timezones in datetime columns are not preserved when a |
|
dataframe is converted into ORC files. |
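
    Examples
    --------
    A minimal sketch: write a small frame to an in-memory ORC payload
    (the column names here are illustrative):

    >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4.0, 3.0]})
    >>> orc_bytes = to_orc(df)  # doctest: +SKIP

    Or write to a path, forwarding a compression option to
    :func:`pyarrow.orc.write_table` via ``engine_kwargs``:

    >>> to_orc(df, path="df.orc", engine_kwargs={"compression": "zstd"})  # doctest: +SKIP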
|
""" |
|
    if index is None:
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # Validate that we have only a default index;
    # raise on anything else, as we don't serialize the index.

    if not df.index.equals(default_index(len(df))):
        raise ValueError(
            "orc does not support serializing a non-default index for the index; "
            "you can .reset_index() to make the index into column(s)"
        )

    if df.index.name is not None:
        raise ValueError("orc does not serialize index meta-data on a default index")

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    engine = import_optional_dependency(engine, min_version="10.0.1")
    pa = import_optional_dependency("pyarrow")
    orc = import_optional_dependency("pyarrow.orc")

    was_none = path is None
    if was_none:
        # No path given: write to an in-memory buffer and return its bytes.
        path = io.BytesIO()
    assert path is not None
    with get_handle(path, "wb", is_text=False) as handles:
        assert isinstance(engine, ModuleType)
        try:
            orc.write_table(
                engine.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except (TypeError, pa.ArrowNotImplementedError) as e:
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from e

    if was_none:
        assert isinstance(path, io.BytesIO)
        return path.getvalue()
    return None
|
|