File size: 4,371 Bytes
c06572a 40548f3 c06572a 28d4d28 7037128 28d4d28 c06572a 04f9dd5 c06572a add3a0f ade99c4 c06572a 40548f3 b57bd69 c06572a 28d4d28 c06572a 28d4d28 c06572a 28d4d28 c06572a 28d4d28 c06572a 28d4d28 c06572a 28d4d28 c06572a 5bb4750 c06572a 28d4d28 87fcfea c06572a 28d4d28 b57bd69 87fcfea ade99c4 b57bd69 28d4d28 ade99c4 5c859a7 7037128 28d4d28 87fcfea 7037128 87fcfea 63f5b6d c06572a 5c859a7 c06572a 28d4d28 63f5b6d 28d4d28 63f5b6d 28d4d28 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
import os
import tempfile

import docx
import pdfplumber
import PyPDF2
import pytesseract
import streamlit as st
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt
from PIL import Image
from transformers import pipeline
# Translation model pipeline
# Built once at import time from the "Helsinki-NLP/opus-mt-mul-en" checkpoint and
# used as a module-level global by translate_content().
# NOTE(review): the checkpoint name suggests multilingual -> English translation;
# confirm this matches the languages of the PDFs you expect.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
# Set Tesseract path (modify for your environment)
# Location of the Tesseract executable that pytesseract invokes for the OCR
# fallback in extract_text_and_tables().
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
def extract_text_and_tables(pdf_path, ocr_resolution=300):
    """Extract structured content from PDF, including tables and text.

    Args:
        pdf_path: Value passed straight to ``pdfplumber.open``.
        ocr_resolution: Resolution (DPI) used to rasterize a page when the
            OCR fallback is needed. Higher values give Tesseract more pixels
            per glyph at the cost of time and memory.

    Returns:
        A list of dicts in page order. For every page, its tables come first
        as ``{"type": "table", "content": <rows>}`` followed by one
        ``{"type": "text", "content": <line>}`` per non-blank line of text.
    """
    content_blocks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract tables
            for table in page.extract_tables():
                content_blocks.append({"type": "table", "content": table})

            # Extract text as paragraphs.
            # NOTE(review): extract_text() runs on the whole page, so text that
            # lives inside a table presumably shows up here as well - confirm.
            text = page.extract_text()
            if not text:
                # Fallback to OCR if text extraction yields nothing (e.g. a
                # scanned page). Rasterize at an OCR-friendly resolution
                # instead of the library's low default.
                page_image = page.to_image(resolution=ocr_resolution)
                text = pytesseract.image_to_string(page_image.original)
            if not text:
                continue

            for line in text.split("\n"):
                paragraph = line.strip()
                # Blank lines carry nothing to translate; do not emit empty blocks.
                if paragraph:
                    content_blocks.append({"type": "text", "content": paragraph})
    return content_blocks
def _is_blank(text):
    """Return True when *text* is None, empty, or whitespace only."""
    return text is None or not text.strip()


def _translate_text(text):
    """Run *text* through the module-level ``translator`` pipeline."""
    return translator(text, max_length=400)[0]["translation_text"]


def translate_content(content_blocks):
    """Translate extracted content preserving structure.

    Args:
        content_blocks: Iterable of dicts with a ``"type"`` key (``"text"`` or
            ``"table"``) and a ``"content"`` key: a string for text blocks, a
            list of rows (lists of cell strings or ``None``) for table blocks.

    Returns:
        A new list of blocks with the same shape. Blank text blocks are
        omitted; blank or missing table cells become ``""`` so every row
        keeps its original length. Blocks of any other type are dropped.
    """
    translated_blocks = []
    for block in content_blocks:
        kind = block["type"]
        if kind == "text":
            if _is_blank(block["content"]):
                continue
            translated_blocks.append(
                {"type": "text", "content": _translate_text(block["content"])}
            )
        elif kind == "table":
            # Use the same blank test as for text blocks so whitespace-only
            # cells are never sent to the model.
            translated_table = [
                ["" if _is_blank(cell) else _translate_text(cell) for cell in row]
                for row in block["content"]
            ]
            translated_blocks.append({"type": "table", "content": translated_table})
    return translated_blocks
def create_translated_doc(translated_blocks, output_path):
    """Generate a translated Word document preserving tables and text.

    Args:
        translated_blocks: Iterable of dicts with ``"type"`` (``"text"`` or
            ``"table"``) and ``"content"`` (a string, or a list of rows).
        output_path: Destination handed to ``Document.save``.

    Text blocks become left-aligned paragraphs. Table blocks become a table
    as wide as their longest row; shorter rows leave trailing cells empty.
    Tables without any cells are skipped.
    """
    doc = docx.Document()
    for block in translated_blocks:
        if block["type"] == "text":
            para = doc.add_paragraph(block["content"])
            para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
            # This sets the size on the paragraph's style object, not on the
            # paragraph itself.
            para.style.font.size = Pt(12)
        elif block["type"] == "table":
            rows = [row or [] for row in (block["content"] or [])]
            # Size by the widest row so ragged rows cannot overflow the grid.
            n_cols = max((len(row) for row in rows), default=0)
            if n_cols == 0:
                # Nothing to render; also avoids indexing into an empty table.
                continue
            table = doc.add_table(rows=len(rows), cols=n_cols)
            for i, row in enumerate(rows):
                for j, cell_text in enumerate(row):
                    # Cell text must be a string; missing cells become empty.
                    table.cell(i, j).text = "" if cell_text is None else str(cell_text)
    doc.save(output_path)
# Streamlit UI
st.title("Professional Multilingual PDF Translator")
uploaded_file = st.file_uploader("Upload a PDF document for structured translation", type=["pdf"])
output_docx_path = "translated_document.docx"  # File name of the generated document

if uploaded_file is not None:
    with st.spinner("Processing and translating the document..."):
        # Work in a private temporary directory: fixed file names in the
        # working directory would let concurrent sessions overwrite or delete
        # each other's files. The directory is removed on exit, even when a
        # step below raises.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_file_path = os.path.join(work_dir, "uploaded_document.pdf")
            translated_path = os.path.join(work_dir, output_docx_path)

            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Extract content from the PDF
            content_blocks = extract_text_and_tables(temp_file_path)
            # Translate content
            translated_blocks = translate_content(content_blocks)
            # Create translated DOCX
            create_translated_doc(translated_blocks, translated_path)

            # Read the result into memory so the download does not depend on
            # a file that is about to be deleted.
            with open(translated_path, "rb") as f:
                docx_bytes = f.read()

    # Provide download link for the translated document
    st.download_button(
        label="Download Translated Document",
        data=docx_bytes,
        file_name="translated_document.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )
|