Spaces:
Sleeping
Sleeping
from pdf2image import convert_from_path | |
import pytesseract | |
from PyPDF2 import PdfReader | |
import tempfile | |
import os | |
def extract_text_simple(pdf_path: str) -> str:
    """Extract the plain text of a PDF directly with PyPDF2.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of every page joined by a blank line. A page whose
        ``extract_text()`` result is falsy contributes an empty string.
        If anything fails, a string starting with ``❌ PDF 讀取錯誤:`` is
        returned instead of raising.
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            pages = PdfReader(pdf_file).pages
            # Extract while the file is still open; the reader works on
            # the open handle.
            page_texts = [page.extract_text() or "" for page in pages]
            return "\n\n".join(page_texts)
    except Exception as err:
        # Failures are reported as text rather than raised to the caller.
        return f"❌ PDF 讀取錯誤: {err}"
def extract_text_ocr(
    pdf_path: str,
    *,
    dpi: int = 300,
    lang: str = "chi_tra",
) -> str:
    """Render each PDF page to an image and recognize its text via OCR.

    Args:
        pdf_path: Path of the PDF file to process.
        dpi: Resolution passed to ``convert_from_path`` when rendering
            the pages. Defaults to 300.
        lang: Language code passed to ``pytesseract.image_to_string``.
            Defaults to ``"chi_tra"``.

    Returns:
        The recognized text of all pages, each section preceded by a
        ``--- Page N ---`` header (N starts at 1). If anything fails, a
        string starting with ``❌ OCR 擷取失敗:`` is returned instead of
        raising.
    """
    try:
        images = convert_from_path(pdf_path, dpi=dpi)
        sections = []
        for page_number, img in enumerate(images, start=1):
            # Convert to mode "L" (grayscale) before handing the page to OCR.
            gray = img.convert("L")
            page_text = pytesseract.image_to_string(gray, lang=lang)
            sections.append(f"\n\n--- Page {page_number} ---\n\n{page_text}")
        # Join once instead of growing a string with += on every page.
        return "".join(sections)
    except Exception as e:
        # Failures are reported as text rather than raised to the caller.
        return f"❌ OCR 擷取失敗: {e}"
def extract_text(pdf_path: str, mode: str = "simple") -> str:
    """Extract text from a PDF using the strategy selected by ``mode``.

    Args:
        pdf_path: Path of the PDF file to process.
        mode: ``"ocr"`` selects ``extract_text_ocr``; any other value
            (including the default ``"simple"``) selects
            ``extract_text_simple``.

    Returns:
        Whatever the selected extractor returns for ``pdf_path``.
    """
    extractor = extract_text_ocr if mode == "ocr" else extract_text_simple
    return extractor(pdf_path)