|
""" orc compat """ |
|
from __future__ import annotations |
|
|
|
import io |
|
from types import ModuleType |
|
from typing import ( |
|
TYPE_CHECKING, |
|
Any, |
|
Literal, |
|
) |
|
|
|
from pandas._config import using_pyarrow_string_dtype |
|
|
|
from pandas._libs import lib |
|
from pandas.compat._optional import import_optional_dependency |
|
from pandas.util._validators import check_dtype_backend |
|
|
|
import pandas as pd |
|
from pandas.core.indexes.api import default_index |
|
|
|
from pandas.io._util import arrow_string_types_mapper |
|
from pandas.io.common import ( |
|
get_handle, |
|
is_fsspec_url, |
|
) |
|
|
|
if TYPE_CHECKING: |
|
import fsspec |
|
import pyarrow.fs |
|
|
|
from pandas._typing import ( |
|
DtypeBackend, |
|
FilePath, |
|
ReadBuffer, |
|
WriteBuffer, |
|
) |
|
|
|
from pandas.core.frame import DataFrame |
|
|
|
|
|
def read_orc( |
|
path: FilePath | ReadBuffer[bytes], |
|
columns: list[str] | None = None, |
|
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, |
|
filesystem: pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None = None, |
|
**kwargs: Any, |
|
) -> DataFrame: |
|
""" |
|
Load an ORC object from the file path, returning a DataFrame. |
|
|
|
Parameters |
|
---------- |
|
path : str, path object, or file-like object |
|
String, path object (implementing ``os.PathLike[str]``), or file-like |
|
object implementing a binary ``read()`` function. The string could be a URL. |
|
Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is |
|
expected. A local file could be: |
|
``file://localhost/path/to/table.orc``. |
|
columns : list, default None |
|
If not None, only these columns will be read from the file. |
|
Output always follows the ordering of the file and not the columns list. |
|
This mirrors the original behaviour of |
|
:external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`. |
|
dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable' |
|
Back-end data type applied to the resultant :class:`DataFrame` |
|
(still experimental). Behaviour is as follows: |
|
|
|
* ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame` |
|
(default). |
|
* ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype` |
|
DataFrame. |
|
|
|
.. versionadded:: 2.0 |
|
|
|
filesystem : fsspec or pyarrow filesystem, default None |
|
Filesystem object to use when reading the parquet file. |
|
|
|
.. versionadded:: 2.1.0 |
|
|
|
**kwargs |
|
Any additional kwargs are passed to pyarrow. |
|
|
|
Returns |
|
------- |
|
DataFrame |
|
|
|
Notes |
|
----- |
|
Before using this function you should read the :ref:`user guide about ORC <io.orc>` |
|
and :ref:`install optional dependencies <install.warn_orc>`. |
|
|
|
If ``path`` is a URI scheme pointing to a local or remote file (e.g. "s3://"), |
|
a ``pyarrow.fs`` filesystem will be attempted to read the file. You can also pass a |
|
pyarrow or fsspec filesystem object into the filesystem keyword to override this |
|
behavior. |
|
|
|
Examples |
|
-------- |
|
>>> result = pd.read_orc("example_pa.orc") # doctest: +SKIP |
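
    Select only certain columns and request pyarrow-backed dtypes; the file
    name is illustrative, as above:

    >>> result = pd.read_orc(
    ...     "example_pa.orc",
    ...     columns=["int_col", "str_col"],
    ...     dtype_backend="pyarrow",
    ... )  # doctest: +SKIP

    Pass an fsspec filesystem explicitly instead of letting pandas infer one
    from a URL (the bucket and key here are hypothetical):

    >>> import fsspec  # doctest: +SKIP
    >>> fs = fsspec.filesystem("s3", anon=True)  # doctest: +SKIP
    >>> result = pd.read_orc("bucket/table.orc", filesystem=fs)  # doctest: +SKIP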
|
""" |
|
|
|
|
|
    orc = import_optional_dependency("pyarrow.orc")

    check_dtype_backend(dtype_backend)

    with get_handle(path, "rb", is_text=False) as handles:
        source = handles.handle
        if is_fsspec_url(path) and filesystem is None:
            pa = import_optional_dependency("pyarrow")
            pa_fs = import_optional_dependency("pyarrow.fs")

            try:
                # Infer a pyarrow filesystem from the URI and read directly
                # from it; fall back to the already-open handle if pyarrow
                # does not understand the URI.
                filesystem, source = pa_fs.FileSystem.from_uri(path)
            except (TypeError, pa.ArrowInvalid):
                pass

        pa_table = orc.read_table(
            source=source, columns=columns, filesystem=filesystem, **kwargs
        )
    if dtype_backend is not lib.no_default:
        if dtype_backend == "pyarrow":
            df = pa_table.to_pandas(types_mapper=pd.ArrowDtype)
        else:
            from pandas.io._util import _arrow_dtype_mapping

            mapping = _arrow_dtype_mapping()
            df = pa_table.to_pandas(types_mapper=mapping.get)
        return df
    else:
        if using_pyarrow_string_dtype():
            types_mapper = arrow_string_types_mapper()
        else:
            types_mapper = None
        return pa_table.to_pandas(types_mapper=types_mapper)


def to_orc(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes] | None = None,
    *,
    engine: Literal["pyarrow"] = "pyarrow",
    index: bool | None = None,
    engine_kwargs: dict[str, Any] | None = None,
) -> bytes | None:
""" |
|
Write a DataFrame to the ORC format. |
|
|
|
.. versionadded:: 1.5.0 |
|
|
|
Parameters |
|
---------- |
|
df : DataFrame |
|
The dataframe to be written to ORC. Raises NotImplementedError |
|
if dtype of one or more columns is category, unsigned integers, |
|
intervals, periods or sparse. |
|
path : str, file-like object or None, default None |
|
If a string, it will be used as Root Directory path |
|
when writing a partitioned dataset. By file-like object, |
|
we refer to objects with a write() method, such as a file handle |
|
(e.g. via builtin open function). If path is None, |
|
a bytes object is returned. |
|
engine : str, default 'pyarrow' |
|
ORC library to use. |
|
index : bool, optional |
|
If ``True``, include the dataframe's index(es) in the file output. If |
|
``False``, they will not be written to the file. |
|
If ``None``, similar to ``infer`` the dataframe's index(es) |
|
will be saved. However, instead of being saved as values, |
|
the RangeIndex will be stored as a range in the metadata so it |
|
doesn't require much space and is faster. Other indexes will |
|
be included as columns in the file output. |
|
engine_kwargs : dict[str, Any] or None, default None |
|
Additional keyword arguments passed to :func:`pyarrow.orc.write_table`. |
|
|
|
Returns |
|
------- |
|
bytes if no path argument is provided else None |
|
|
|
Raises |
|
------ |
|
NotImplementedError |
|
Dtype of one or more columns is category, unsigned integers, interval, |
|
period or sparse. |
|
ValueError |
|
engine is not pyarrow. |
|
|
|
Notes |
|
----- |
|
* Before using this function you should read the |
|
:ref:`user guide about ORC <io.orc>` and |
|
:ref:`install optional dependencies <install.warn_orc>`. |
|
* This function requires `pyarrow <https://arrow.apache.org/docs/python/>`_ |
|
library. |
|
* For supported dtypes please refer to `supported ORC features in Arrow |
|
<https://arrow.apache.org/docs/cpp/orc.html#data-types>`__. |
|
* Currently timezones in datetime columns are not preserved when a |
|
dataframe is converted into ORC files. |
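
    Examples
    --------
    A minimal sketch: write a small frame to an in-memory ORC payload
    (the column names here are illustrative):

    >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4.0, 3.0]})
    >>> orc_bytes = to_orc(df)  # doctest: +SKIP

    Or write to a path, forwarding a compression option to
    :func:`pyarrow.orc.write_table` via ``engine_kwargs``:

    >>> to_orc(df, path="df.orc", engine_kwargs={"compression": "zstd"})  # doctest: +SKIP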
|
""" |
|
    if index is None:
        index = df.index.names[0] is not None
    if engine_kwargs is None:
        engine_kwargs = {}

    # Validate that we have only a default index;
    # raise on anything else, as we don't serialize the index.

    if not df.index.equals(default_index(len(df))):
        raise ValueError(
            "orc does not support serializing a non-default index for the index; "
            "you can .reset_index() to make the index into column(s)"
        )

    if df.index.name is not None:
        raise ValueError("orc does not serialize index meta-data on a default index")

    if engine != "pyarrow":
        raise ValueError("engine must be 'pyarrow'")
    engine = import_optional_dependency(engine, min_version="10.0.1")
    pa = import_optional_dependency("pyarrow")
    orc = import_optional_dependency("pyarrow.orc")

    was_none = path is None
    if was_none:
        # No path given: write to an in-memory buffer and return its bytes.
        path = io.BytesIO()
    assert path is not None
    with get_handle(path, "wb", is_text=False) as handles:
        assert isinstance(engine, ModuleType)
        try:
            orc.write_table(
                engine.Table.from_pandas(df, preserve_index=index),
                handles.handle,
                **engine_kwargs,
            )
        except (TypeError, pa.ArrowNotImplementedError) as e:
            raise NotImplementedError(
                "The dtype of one or more columns is not supported yet."
            ) from e

    if was_none:
        assert isinstance(path, io.BytesIO)
        return path.getvalue()
    return None
|
|