File size: 4,443 Bytes
7885a28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 |
""" feather-format compat """
from __future__ import annotations
from typing import (
TYPE_CHECKING,
Any,
)
from pandas._config import using_pyarrow_string_dtype
from pandas._libs import lib
from pandas.compat._optional import import_optional_dependency
from pandas.util._decorators import doc
from pandas.util._validators import check_dtype_backend
import pandas as pd
from pandas.core.api import DataFrame
from pandas.core.shared_docs import _shared_docs
from pandas.io._util import arrow_string_types_mapper
from pandas.io.common import get_handle
if TYPE_CHECKING:
from collections.abc import (
Hashable,
Sequence,
)
from pandas._typing import (
DtypeBackend,
FilePath,
ReadBuffer,
StorageOptions,
WriteBuffer,
)
@doc(storage_options=_shared_docs["storage_options"])
def to_feather(
    df: DataFrame,
    path: FilePath | WriteBuffer[bytes],
    storage_options: StorageOptions | None = None,
    **kwargs: Any,
) -> None:
    """
    Write a DataFrame to the binary Feather format.

    Parameters
    ----------
    df : DataFrame
        Object to serialize. Feather only supports DataFrames; any other
        type raises.
    path : str, path object, or file-like object
        Destination, opened in binary-write mode via ``get_handle`` (so
        local paths, buffers and storage-options-backed paths all work).
    {storage_options}
    **kwargs :
        Additional keywords passed to `pyarrow.feather.write_feather`.

    Raises
    ------
    ValueError
        If ``df`` is not a DataFrame.
    ImportError
        If the optional pyarrow dependency is not installed.
    """
    # Validate the input *before* probing the optional dependency so the
    # caller gets the clear ValueError even when pyarrow is not installed.
    if not isinstance(df, DataFrame):
        raise ValueError("feather only supports IO with DataFrames")

    import_optional_dependency("pyarrow")
    from pyarrow import feather

    with get_handle(
        path, "wb", storage_options=storage_options, is_text=False
    ) as handles:
        feather.write_feather(df, handles.handle, **kwargs)
@doc(storage_options=_shared_docs["storage_options"])
def read_feather(
    path: FilePath | ReadBuffer[bytes],
    columns: Sequence[Hashable] | None = None,
    use_threads: bool = True,
    storage_options: StorageOptions | None = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame:
    """
    Load a feather-format object from the file path.

    Parameters
    ----------
    path : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a binary ``read()`` function. The string could be a URL.
        Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
        expected. A local file could be: ``file://localhost/path/to/table.feather``.
    columns : sequence, default None
        If not provided, all columns are read.
    use_threads : bool, default True
        Whether to parallelize reading using multiple threads.
    {storage_options}
    dtype_backend : {{'numpy_nullable', 'pyarrow'}}, default 'numpy_nullable'
        Back-end data type applied to the resultant :class:`DataFrame`
        (still experimental). Behaviour is as follows:

        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
          (default).
        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
          DataFrame.

        .. versionadded:: 2.0

    Returns
    -------
    type of object stored in file

    Raises
    ------
    ImportError
        If the optional pyarrow dependency is not installed.
    NotImplementedError
        If ``dtype_backend`` passes validation but matches none of the
        supported conversion branches (defensive fall-through).

    Examples
    --------
    >>> df = pd.read_feather("path/to/file.feather")  # doctest: +SKIP
    """
    import_optional_dependency("pyarrow")
    from pyarrow import feather

    # import utils to register the pyarrow extension types
    import pandas.core.arrays.arrow.extension_types  # pyright: ignore[reportUnusedImport] # noqa: F401

    # Reject anything other than no_default / "numpy_nullable" / "pyarrow".
    check_dtype_backend(dtype_backend)

    with get_handle(
        path, "rb", storage_options=storage_options, is_text=False
    ) as handles:
        # Fast path: no backend requested and the pyarrow-string option is
        # off, so delegate the whole read + conversion to pyarrow directly.
        if dtype_backend is lib.no_default and not using_pyarrow_string_dtype():
            return feather.read_feather(
                handles.handle, columns=columns, use_threads=bool(use_threads)
            )

        # Otherwise read into an Arrow table first so we can control the
        # Arrow -> pandas dtype mapping below. The conversion happens inside
        # this `with` block, i.e. while the handle is still open.
        pa_table = feather.read_table(
            handles.handle, columns=columns, use_threads=bool(use_threads)
        )

        if dtype_backend == "numpy_nullable":
            from pandas.io._util import _arrow_dtype_mapping

            # Map Arrow types to pandas' nullable extension dtypes.
            return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)

        elif dtype_backend == "pyarrow":
            # Keep the data Arrow-backed via ArrowDtype columns.
            return pa_table.to_pandas(types_mapper=pd.ArrowDtype)

        elif using_pyarrow_string_dtype():
            # No explicit backend, but the global pyarrow-string option is
            # set: map only string types to pyarrow-backed strings.
            return pa_table.to_pandas(types_mapper=arrow_string_types_mapper())
        else:
            # Unreachable if check_dtype_backend stays in sync with the
            # branches above; kept as a defensive guard.
            raise NotImplementedError
|