Spaces:
Sleeping
Sleeping
File size: 1,527 Bytes
d1f503e 71fadee d1f503e 321048b 71fadee d1f503e 71fadee d1f503e 71fadee d1f503e 71fadee d1f503e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import os
import pytesseract
from PyPDF2 import PdfReader
from pdf2image import convert_from_path
from typing import Literal
def extract_text_simple(pdf_path: str) -> str:
    """Extract the embedded text of a PDF using PyPDF2.

    Pages are read in document order. Each page's text is stripped of
    leading/trailing whitespace, and pages that end up empty are skipped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all non-empty pages joined with newlines, or an empty
        string when no page yields any text.
    """
    reader = PdfReader(pdf_path)
    # Strip BEFORE testing for emptiness: a page that contains only
    # whitespace would otherwise pass the check and contribute an empty
    # entry (i.e. a stray blank line) to the joined result. The `or ""`
    # keeps the guard against a falsy extraction result.
    stripped_pages = (
        (page.extract_text() or "").strip() for page in reader.pages
    )
    return "\n".join(text for text in stripped_pages if text)
def extract_text_ocr(pdf_path: str, dpi: int = 300) -> str:
    """Extract text from an image-based PDF using Tesseract OCR.

    The PDF is rendered to images with pdf2image, and each image is passed
    to Tesseract using the "chi_tra+eng" language setting. Each page's
    recognized text is stripped of leading/trailing whitespace, and pages
    that end up empty are skipped.

    Args:
        pdf_path: Path of the PDF file to read.
        dpi: Resolution passed to pdf2image when rendering the pages.

    Returns:
        The recognized text of all non-empty pages joined with newlines, or
        an empty string when nothing was recognized.
    """
    images = convert_from_path(pdf_path, dpi=dpi)
    # Strip BEFORE testing for emptiness: OCR output for a blank page is
    # typically whitespace-only, which is truthy but strips down to "" and
    # would otherwise add a stray blank line to the joined result.
    stripped_pages = (
        (pytesseract.image_to_string(img, lang="chi_tra+eng") or "").strip()
        for img in images
    )
    return "\n".join(text for text in stripped_pages if text)
def extract_text(pdf_path: str, mode: Literal["simple", "ocr"] = "simple") -> str:
    """Extract text from a PDF with the method selected by ``mode``.

    Args:
        pdf_path: Path of the PDF file to read.
        mode: ``"ocr"`` selects OCR-based extraction; any other value
            selects the plain PyPDF2-based extraction.

    Returns:
        The text produced by the selected extraction function.
    """
    # Only the exact value "ocr" switches to OCR; everything else falls
    # back to the simple extractor.
    extractor = extract_text_ocr if mode == "ocr" else extract_text_simple
    return extractor(pdf_path)
# Compatibility interface for app.py.
def convert_PDF_to_Text(pdf_path: str, ocr_model=None, max_pages: int = 20) -> dict:
    """Provide the ``convert_PDF_to_Text`` interface required by app.py.

    Args:
        pdf_path: Path of the PDF file to convert.
        ocr_model: Only its truthiness is used: a truthy value selects OCR
            extraction, a falsy value selects plain text extraction.
        max_pages: Accepted as part of the interface; this function does
            not use it.

    Returns:
        A dict with the keys ``converted_text`` (the extracted text),
        ``source_path`` (the given ``pdf_path``), ``used_ocr`` (whether OCR
        was selected) and ``page_count`` (always the string ``"N/A"``).
    """
    selected_mode = "ocr" if ocr_model else "simple"
    extracted = extract_text(pdf_path, mode=selected_mode)
    return dict(
        converted_text=extracted,
        source_path=pdf_path,
        used_ocr=bool(ocr_model),
        page_count="N/A",
    )


# Lowercase alias bound to the same function.
convert_pdf_to_text = convert_PDF_to_Text
|