DocSummarizer_Jimmy / pdf2text.py
Jimmy0866's picture
Upload 3 files
d1f503e verified
import os
import pytesseract
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
from typing import Literal
def extract_text_simple(pdf_path: str) -> str:
    """
    Extract the embedded text layer of a PDF with PyPDF2.

    Args:
        pdf_path: Path of the PDF file handed to ``PdfReader``.

    Returns:
        The stripped text of each page that produced any, joined with
        newlines in page order. Pages that yield no text, or only
        whitespace, are left out, so the result is ``""`` when nothing
        could be extracted.
    """
    reader = PdfReader(pdf_path)
    # Strip BEFORE testing for emptiness: a page whose extracted text is
    # whitespace-only is truthy as returned and would otherwise contribute
    # an empty entry (a stray blank line) to the joined output.
    stripped_pages = (
        (page.extract_text() or "").strip() for page in reader.pages
    )
    return "\n".join(text for text in stripped_pages if text)
def extract_text_ocr(
    pdf_path: str,
    dpi: int = 300,
    lang: str = "chi_tra+eng",
) -> str:
    """
    Extract text from an image-based PDF with Tesseract OCR.

    Args:
        pdf_path: Path of the PDF file to rasterize.
        dpi: Resolution passed to ``convert_from_path`` when rendering the
            pages to images.
        lang: Language specification passed to
            ``pytesseract.image_to_string``. Defaults to ``"chi_tra+eng"``,
            the value this function previously hard-coded.

    Returns:
        The stripped OCR text of each page that produced any, joined with
        newlines in page order. Pages whose OCR output is empty or only
        whitespace are left out.
    """
    images = convert_from_path(pdf_path, dpi=dpi)
    page_texts = []
    for img in images:
        # Strip BEFORE testing for emptiness: OCR of a blank page can come
        # back as whitespace only (e.g. a page-separator form feed), which
        # is truthy and would otherwise add an empty entry to the output.
        text = (pytesseract.image_to_string(img, lang=lang) or "").strip()
        if text:
            page_texts.append(text)
    return "\n".join(page_texts)
def extract_text(pdf_path: str, mode: Literal["simple", "ocr"] = "simple") -> str:
    """
    Extract text from a PDF using the method selected by ``mode``.

    Args:
        pdf_path: Path of the PDF file.
        mode: ``"ocr"`` routes to ``extract_text_ocr``; every other value
            routes to ``extract_text_simple``.

    Returns:
        The text returned by the selected extractor.
    """
    # Pick the extractor first, then make a single call.
    extractor = extract_text_ocr if mode == "ocr" else extract_text_simple
    return extractor(pdf_path)
# Compatibility interface for app.py
def convert_PDF_to_Text(pdf_path: str, ocr_model=None, max_pages: int = 20) -> dict:
    """
    Stand-in for the ``convert_PDF_to_Text`` interface that app.py requires.

    Args:
        pdf_path: Path of the PDF file.
        ocr_model: Used only for its truthiness: a truthy value selects OCR
            extraction, a falsy one (the default ``None``) selects plain
            text extraction.
        max_pages: Accepted as part of the signature; this function does
            not read it.

    Returns:
        A dict with the keys ``converted_text`` (the extracted text),
        ``source_path`` (``pdf_path`` as given), ``used_ocr`` (bool) and
        ``page_count`` (always the string ``"N/A"``).
    """
    use_ocr = bool(ocr_model)
    converted = extract_text(pdf_path, mode="ocr" if use_ocr else "simple")
    summary = {
        "converted_text": converted,
        "source_path": pdf_path,
        "used_ocr": use_ocr,
        "page_count": "N/A",
    }
    return summary
# Lowercase alias: a second name bound to the same function object.
convert_pdf_to_text = convert_PDF_to_Text