Spaces:
Sleeping
Sleeping
import os | |
import pytesseract | |
from PyPDF2 import PdfReader | |
from pdf2image import convert_from_path | |
from typing import Literal | |
def extract_text_simple(pdf_path: str) -> str:
    """Extract the embedded text layer of a PDF using PyPDF2.

    Each page's text is stripped of surrounding whitespace. Pages that
    yield no text at all (``None``, empty, or whitespace-only) are skipped
    so they do not leave blank lines in the result.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of the non-empty pages joined with newlines, or an empty
        string when no page produced any text.
    """
    reader = PdfReader(pdf_path)
    # Strip BEFORE filtering: a page whose text is only whitespace is
    # truthy as-is but empty once stripped, and must be skipped too.
    stripped_pages = (
        (page.extract_text() or "").strip() for page in reader.pages
    )
    return "\n".join(text for text in stripped_pages if text)
def extract_text_ocr(
    pdf_path: str, dpi: int = 300, lang: str = "chi_tra+eng"
) -> str:
    """Extract text from an image-based PDF using Tesseract OCR.

    The PDF is rendered to one image per page with ``convert_from_path``
    and each image is passed to ``pytesseract.image_to_string``. Pages
    whose OCR output is empty or whitespace-only are skipped so they do
    not leave blank lines in the result.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Resolution used when rendering the PDF pages to images.
        lang: Tesseract language specification passed through as the
            ``lang`` argument; defaults to Traditional Chinese plus
            English, matching the previously hard-coded value.

    Returns:
        The recognized text of the non-empty pages joined with newlines,
        or an empty string when nothing was recognized.
    """
    images = convert_from_path(pdf_path, dpi=dpi)
    page_texts = []
    for img in images:
        # Strip BEFORE the emptiness check: whitespace-only OCR output is
        # truthy as-is but empty once stripped, and must be skipped too.
        text = (pytesseract.image_to_string(img, lang=lang) or "").strip()
        if text:
            page_texts.append(text)
    return "\n".join(page_texts)
def extract_text(pdf_path: str, mode: Literal["simple", "ocr"] = "simple") -> str:
    """Extract text from a PDF with the extraction method chosen by *mode*.

    Args:
        pdf_path: Path of the PDF file to read.
        mode: ``"ocr"`` selects ``extract_text_ocr``; any other value
            selects ``extract_text_simple``.

    Returns:
        The text returned by the selected extractor.
    """
    # Pick the extractor first, then call it once with the path.
    extractor = extract_text_ocr if mode == "ocr" else extract_text_simple
    return extractor(pdf_path)
# Compatibility interface for app.py.
def convert_PDF_to_Text(pdf_path: str, ocr_model=None, max_pages: int = 20) -> dict:
    """Provide the ``convert_PDF_to_Text`` interface that app.py calls.

    Args:
        pdf_path: Path of the PDF file to read.
        ocr_model: Only its truthiness is used here: a truthy value
            selects OCR extraction, a falsy one selects plain-text
            extraction.
        max_pages: Accepted as part of the interface but not used by
            this implementation.

    Returns:
        A dict with the keys ``"converted_text"`` (the extracted text),
        ``"source_path"`` (the given path), ``"used_ocr"`` (whether OCR
        mode was selected) and ``"page_count"`` (always the string
        ``"N/A"``).
    """
    use_ocr = bool(ocr_model)
    if use_ocr:
        extracted = extract_text(pdf_path, mode="ocr")
    else:
        extracted = extract_text(pdf_path, mode="simple")

    result = {
        "converted_text": extracted,
        "source_path": pdf_path,
        "used_ocr": use_ocr,
        "page_count": "N/A",
    }
    return result


# Lower-case alias bound to the same function object.
convert_pdf_to_text = convert_PDF_to_Text