|
from __future__ import annotations |
|
|
|
from datetime import ( |
|
date, |
|
datetime, |
|
time, |
|
timedelta, |
|
) |
|
from typing import ( |
|
TYPE_CHECKING, |
|
Any, |
|
Union, |
|
) |
|
|
|
from pandas.compat._optional import import_optional_dependency |
|
from pandas.util._decorators import doc |
|
|
|
import pandas as pd |
|
from pandas.core.shared_docs import _shared_docs |
|
|
|
from pandas.io.excel._base import BaseExcelReader |
|
|
|
if TYPE_CHECKING: |
|
from python_calamine import ( |
|
CalamineSheet, |
|
CalamineWorkbook, |
|
) |
|
|
|
from pandas._typing import ( |
|
FilePath, |
|
NaTType, |
|
ReadBuffer, |
|
Scalar, |
|
StorageOptions, |
|
) |
|
|
|
_CellValue = Union[int, float, str, bool, time, date, datetime, timedelta] |
|
|
|
|
|
class CalamineReader(BaseExcelReader["CalamineWorkbook"]): |
|
@doc(storage_options=_shared_docs["storage_options"]) |
|
def __init__( |
|
self, |
|
filepath_or_buffer: FilePath | ReadBuffer[bytes], |
|
storage_options: StorageOptions | None = None, |
|
engine_kwargs: dict | None = None, |
|
) -> None: |
|
""" |
|
Reader using calamine engine (xlsx/xls/xlsb/ods). |
|
|
|
Parameters |
|
---------- |
|
filepath_or_buffer : str, path to be parsed or |
|
an open readable stream. |
|
{storage_options} |
|
engine_kwargs : dict, optional |
|
Arbitrary keyword arguments passed to excel engine. |
|
""" |
|
import_optional_dependency("python_calamine") |
|
super().__init__( |
|
filepath_or_buffer, |
|
storage_options=storage_options, |
|
engine_kwargs=engine_kwargs, |
|
) |
|
|
|
@property |
|
def _workbook_class(self) -> type[CalamineWorkbook]: |
|
from python_calamine import CalamineWorkbook |
|
|
|
return CalamineWorkbook |
|
|
|
def load_workbook( |
|
self, filepath_or_buffer: FilePath | ReadBuffer[bytes], engine_kwargs: Any |
|
) -> CalamineWorkbook: |
|
from python_calamine import load_workbook |
|
|
|
return load_workbook(filepath_or_buffer, **engine_kwargs) |
|
|
|
@property |
|
def sheet_names(self) -> list[str]: |
|
from python_calamine import SheetTypeEnum |
|
|
|
return [ |
|
sheet.name |
|
for sheet in self.book.sheets_metadata |
|
if sheet.typ == SheetTypeEnum.WorkSheet |
|
] |
|
|
|
def get_sheet_by_name(self, name: str) -> CalamineSheet: |
|
self.raise_if_bad_sheet_by_name(name) |
|
return self.book.get_sheet_by_name(name) |
|
|
|
def get_sheet_by_index(self, index: int) -> CalamineSheet: |
|
self.raise_if_bad_sheet_by_index(index) |
|
return self.book.get_sheet_by_index(index) |
|
|
|
def get_sheet_data( |
|
self, sheet: CalamineSheet, file_rows_needed: int | None = None |
|
) -> list[list[Scalar | NaTType | time]]: |
|
def _convert_cell(value: _CellValue) -> Scalar | NaTType | time: |
|
if isinstance(value, float): |
|
val = int(value) |
|
if val == value: |
|
return val |
|
else: |
|
return value |
|
elif isinstance(value, date): |
|
return pd.Timestamp(value) |
|
elif isinstance(value, timedelta): |
|
return pd.Timedelta(value) |
|
elif isinstance(value, time): |
|
return value |
|
|
|
return value |
|
|
|
rows: list[list[_CellValue]] = sheet.to_python( |
|
skip_empty_area=False, nrows=file_rows_needed |
|
) |
|
data = [[_convert_cell(cell) for cell in row] for row in rows] |
|
|
|
return data |
|
|