"""PDF text extraction helpers: direct extraction via PyPDF2 or OCR via Tesseract."""

import os
from itertools import islice
from typing import Literal, Optional

import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfReader


def _page_limit(max_pages: Optional[int]) -> Optional[int]:
    """Normalize *max_pages*: return a positive page limit, or None for "no limit"."""
    if max_pages is None or max_pages < 1:
        return None
    return max_pages


def _join_page_texts(texts) -> str:
    """Strip each page's text, drop pages that end up empty, and join with newlines."""
    # Strip *before* filtering so whitespace-only pages (common for blank
    # scans) do not leave empty entries in the joined output.
    stripped = (text.strip() for text in texts if text)
    return "\n".join(text for text in stripped if text)


def extract_text_simple(pdf_path: str, max_pages: Optional[int] = None) -> str:
    """Extract the embedded plain text of a PDF using PyPDF2.

    Args:
        pdf_path: Path of the PDF file to read.
        max_pages: If a positive integer, only the first ``max_pages`` pages
            are read. ``None`` (the default) or a value below 1 reads every
            page.

    Returns:
        The stripped text of each non-empty page, joined with newlines.
    """
    reader = PdfReader(pdf_path)
    pages = islice(reader.pages, _page_limit(max_pages))
    return _join_page_texts(page.extract_text() for page in pages)


def extract_text_ocr(
    pdf_path: str, dpi: int = 300, max_pages: Optional[int] = None
) -> str:
    """Extract text from an image-based PDF using Tesseract OCR.

    Pages are rendered to images with pdf2image and recognized with the
    Traditional Chinese + English Tesseract language data.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Resolution used when rendering pages to images.
        max_pages: If a positive integer, only the first ``max_pages`` pages
            are rendered and recognized. ``None`` (the default) or a value
            below 1 processes every page.

    Returns:
        The stripped OCR text of each non-empty page, joined with newlines.
    """
    # last_page=None is pdf2image's default, i.e. "render up to the last page",
    # so the unlimited case behaves exactly like a call without the argument.
    images = convert_from_path(pdf_path, dpi=dpi, last_page=_page_limit(max_pages))
    return _join_page_texts(
        pytesseract.image_to_string(img, lang="chi_tra+eng") for img in images
    )


def extract_text(
    pdf_path: str,
    mode: Literal["simple", "ocr"] = "simple",
    max_pages: Optional[int] = None,
) -> str:
    """Extract text from a PDF with the method selected by *mode*.

    Args:
        pdf_path: Path of the PDF file to read.
        mode: ``"ocr"`` selects Tesseract OCR; any other value uses the
            PyPDF2 plain-text extraction.
        max_pages: Optional limit on the number of leading pages processed;
            ``None`` (the default) processes every page.

    Returns:
        The extracted text.
    """
    if mode == "ocr":
        return extract_text_ocr(pdf_path, max_pages=max_pages)
    return extract_text_simple(pdf_path, max_pages=max_pages)


# Compatibility interface for app.py.
def convert_PDF_to_Text(pdf_path: str, ocr_model=None, max_pages: int = 20) -> dict:
    """Provide the ``convert_PDF_to_Text`` interface expected by app.py.

    Args:
        pdf_path: Path of the PDF file to read.
        ocr_model: Only its truthiness is used: a truthy value selects OCR
            mode, a falsy value selects plain-text extraction.
        max_pages: Maximum number of leading pages to process. A value
            below 1 disables the limit.

    Returns:
        A dict with the keys ``converted_text`` (the extracted text),
        ``source_path`` (the given path), ``used_ocr`` (whether OCR mode was
        selected) and ``page_count`` (currently the placeholder ``"N/A"``).
    """
    text = extract_text(
        pdf_path,
        mode="ocr" if ocr_model else "simple",
        max_pages=max_pages,
    )
    return {
        "converted_text": text,
        "source_path": pdf_path,
        "used_ocr": bool(ocr_model),
        # Placeholder: the page count is not computed by this shim.
        "page_count": "N/A",
    }


# Lower-case alias for the same function.
convert_pdf_to_text = convert_PDF_to_Text