tahirsher's picture
Update app.py
5c859a7 verified
raw
history blame
4.37 kB
import os
import tempfile

import docx
import pdfplumber
import PyPDF2
import pytesseract
import streamlit as st
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt
from PIL import Image
from transformers import pipeline
# Translation model pipeline
@st.cache_resource
def _load_translator():
    """Build the Helsinki-NLP/opus-mt-mul-en translation pipeline once.

    Streamlit re-executes this script from the top on every widget
    interaction; caching the pipeline as a resource avoids re-instantiating
    the model on each rerun.
    """
    return pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")


translator = _load_translator()

# Set Tesseract path (modify for your environment)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
def extract_text_and_tables(pdf_path, ocr_resolution=300):
    """Extract structured content from PDF, including tables and text.

    Args:
        pdf_path: Path of the PDF, passed straight to ``pdfplumber.open``.
        ocr_resolution: DPI used to rasterise a page for the OCR fallback.
            Only used for pages where ``extract_text()`` yields nothing.

    Returns:
        A list of block dicts in page order. Within a page, table blocks
        come first, then one text block per non-blank line:
        ``{"type": "table", "content": <rows from extract_tables()>}`` or
        ``{"type": "text", "content": <stripped line>}``.
    """
    content_blocks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract tables; an empty table carries nothing to translate
            # and cannot be laid out in the output document, so skip it.
            for table in page.extract_tables():
                if table:
                    content_blocks.append({"type": "table", "content": table})

            # Extract text as paragraphs
            text = page.extract_text()
            if not text:  # Fallback to OCR if text extraction fails
                # Render above pdfplumber's default resolution so that
                # Tesseract has enough pixels per glyph to work with.
                page_image = page.to_image(resolution=ocr_resolution)
                text = pytesseract.image_to_string(page_image.original)
            if not text:
                continue

            for line in text.split("\n"):
                stripped = line.strip()
                # Blank lines would only produce empty blocks, so drop them.
                if stripped:
                    content_blocks.append({"type": "text", "content": stripped})
    return content_blocks
def _translate_with_pipeline(text):
    """Translate one string with the module-level ``translator`` pipeline."""
    return translator(text, max_length=400)[0]["translation_text"]


def translate_content(content_blocks, translate_fn=None):
    """Translate extracted content preserving structure.

    Args:
        content_blocks: Iterable of dicts with a ``"type"`` of ``"text"``
            (``"content"`` is a string) or ``"table"`` (``"content"`` is a
            list of rows, each a list of cell strings or ``None``).
        translate_fn: Optional callable mapping one source string to its
            translation. Defaults to the module-level ``translator``
            pipeline called with ``max_length=400``.

    Returns:
        A new list of blocks with the same structure. Text blocks whose
        content is empty or whitespace-only are dropped; table cells that
        are ``None``, empty or whitespace-only become ``""``. Blocks of any
        other type are omitted.
    """
    translate = translate_fn if translate_fn is not None else _translate_with_pipeline

    translated_blocks = []
    for block in content_blocks:
        kind = block["type"]
        if kind == "text":
            if block["content"].strip():
                translated_blocks.append(
                    {"type": "text", "content": translate(block["content"])}
                )
        elif kind == "table":
            translated_table = [
                # Same blank check as for text blocks: never send an empty
                # or whitespace-only cell to the model.
                [translate(cell) if cell and cell.strip() else "" for cell in row]
                for row in block["content"]
            ]
            translated_blocks.append({"type": "table", "content": translated_table})
    return translated_blocks
def create_translated_doc(translated_blocks, output_path):
    """Generate a translated Word document preserving tables and text.

    Args:
        translated_blocks: Iterable of dicts with ``"type"`` ``"text"``
            (``"content"`` is the paragraph string) or ``"table"``
            (``"content"`` is a list of rows of cell values).
        output_path: Destination passed to ``Document.save``.

    Tables are sized to their widest row, so ragged rows are written
    without indexing past the grid; tables with no rows or no columns are
    skipped. ``None`` cells are written as empty strings.
    """
    doc = docx.Document()
    for block in translated_blocks:
        if block["type"] == "text":
            para = doc.add_paragraph(block["content"])
            para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
            # NOTE(review): this sets the size on the paragraph's style
            # object, not on this paragraph alone - presumably affecting
            # every paragraph that shares the style; confirm that is intended.
            para.style.font.size = Pt(12)
        elif block["type"] == "table":
            rows = block["content"]
            # Use the widest row, not just the first, to size the grid.
            col_count = max((len(row) for row in rows), default=0)
            if col_count == 0:
                continue
            table = doc.add_table(rows=len(rows), cols=col_count)
            for i, row in enumerate(rows):
                for j, cell_text in enumerate(row):
                    table.cell(i, j).text = "" if cell_text is None else str(cell_text)
    doc.save(output_path)
# Streamlit UI
st.title("Professional Multilingual PDF Translator")
uploaded_file = st.file_uploader("Upload a PDF document for structured translation", type=["pdf"])
output_docx_path = "translated_document.docx"  # Ensure the variable is defined outside any block

if uploaded_file is not None:
    with st.spinner("Processing and translating the document..."):
        # Work inside a private temporary directory: fixed file names in the
        # working directory would be shared (and clobbered or deleted) by
        # concurrent sessions. The directory and its contents are removed
        # when the block exits, including on error.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_file_path = os.path.join(work_dir, "uploaded_document.pdf")
            docx_file_path = os.path.join(work_dir, output_docx_path)

            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Extract content from the PDF
            content_blocks = extract_text_and_tables(temp_file_path)
            # Translate content
            translated_blocks = translate_content(content_blocks)
            # Create translated DOCX
            create_translated_doc(translated_blocks, docx_file_path)

            # Read the result into memory before the directory is removed, so
            # the download button never refers to a file that no longer exists.
            with open(docx_file_path, "rb") as f:
                docx_bytes = f.read()

    # Provide download link for the translated document
    st.download_button(
        label="Download Translated Document",
        data=docx_bytes,
        file_name=output_docx_path,
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )