import os
import tempfile

import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfReader


def extract_text_simple(pdf_path: str) -> str:
    """Extract the embedded text layer of a PDF with PyPDF2.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The text of every page, with pages separated by a blank line. A page
        for which ``extract_text()`` yields nothing contributes an empty
        string. If anything goes wrong, no exception is raised; instead a
        message starting with ``"❌ PDF 讀取錯誤: "`` is returned.
    """
    try:
        with open(pdf_path, "rb") as pdf_file:
            reader = PdfReader(pdf_file)
            # Extraction must finish while the file is still open, because
            # the reader pulls page data from the handle on demand.
            return "\n\n".join(
                page.extract_text() or "" for page in reader.pages
            )
    except Exception as e:
        # Deliberate best-effort contract: callers receive the failure as
        # text in the return value rather than as an exception.
        return f"❌ PDF 讀取錯誤: {e}"


def extract_text_ocr(
    pdf_path: str, *, dpi: int = 300, lang: str = "chi_tra"
) -> str:
    """Render each PDF page to an image and recognise its text with Tesseract.

    Args:
        pdf_path: Filesystem path of the PDF to read.
        dpi: Resolution passed to ``convert_from_path`` when rasterising the
            pages.
        lang: Tesseract language code passed to ``image_to_string``. The
            default ``"chi_tra"`` selects Traditional Chinese.

    Returns:
        The recognised text of all pages. Each page is preceded by a
        ``--- Page N ---`` header (1-based) surrounded by blank lines. If
        anything goes wrong, no exception is raised; instead a message
        starting with ``"❌ OCR 擷取失敗: "`` is returned.
    """
    try:
        images = convert_from_path(pdf_path, dpi=dpi)
        # Build all page chunks first and join once, instead of growing a
        # string with ``+=`` on every page.
        return "".join(
            f"\n\n--- Page {page_number} ---\n\n"
            # Each page is converted to grayscale ("L" mode) before OCR.
            + pytesseract.image_to_string(img.convert("L"), lang=lang)
            for page_number, img in enumerate(images, start=1)
        )
    except Exception as e:
        # Deliberate best-effort contract: callers receive the failure as
        # text in the return value rather than as an exception.
        return f"❌ OCR 擷取失敗: {e}"


def extract_text(pdf_path: str, mode: str = "simple") -> str:
    """Extract text from a PDF using the selected strategy.

    Args:
        pdf_path: Filesystem path of the PDF to read.
        mode: ``"ocr"`` selects :func:`extract_text_ocr`. Any other value,
            including the default ``"simple"``, selects
            :func:`extract_text_simple`.

    Returns:
        Whatever the selected extractor returns: the extracted text, or an
        error message beginning with ``❌`` on failure.
    """
    if mode == "ocr":
        return extract_text_ocr(pdf_path)
    return extract_text_simple(pdf_path)