# app.py, uploaded by tahirsher
# Commit 5c859a7 (verified): "Update app.py"
import hashlib
import os
import tempfile

import docx
import pdfplumber
import PyPDF2
import pytesseract
import streamlit as st
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt
from PIL import Image
from transformers import pipeline
# Translation model pipeline
@st.cache_resource
def _load_translator():
    """Build the multilingual-to-English translation pipeline.

    Streamlit re-executes this script on every user interaction; caching the
    pipeline as a resource avoids reloading the model on each rerun.
    """
    return pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")


translator = _load_translator()

# Set Tesseract path. The TESSERACT_CMD environment variable overrides the
# default location so the app can run where the binary lives elsewhere.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", "/usr/bin/tesseract"
)
def extract_text_and_tables(pdf_path, *, ocr_lang=None, ocr_resolution=300):
    """Extract structured content from a PDF, including tables and text.

    For every page, all tables found by pdfplumber are emitted first, followed
    by the page text split into one block per non-blank line. When a page
    yields no extractable text, the page is rasterised and passed through
    Tesseract OCR instead.

    Args:
        pdf_path: Path of the PDF file to read.
        ocr_lang: Tesseract language code(s) for the OCR fallback, e.g.
            ``"deu"`` or ``"fra+spa"``. ``None`` keeps Tesseract's default.
        ocr_resolution: Resolution (DPI) used to rasterise a page for OCR.

    Returns:
        A list of dicts in document order. Each dict has a ``"type"`` key
        (``"table"`` or ``"text"``) and a ``"content"`` key holding either the
        table as returned by ``page.extract_tables()`` or a stripped line of
        text.
    """
    content_blocks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # NOTE(review): extract_text() below presumably also returns the
            # characters located inside these tables, so table content may
            # appear a second time as text blocks -- confirm and filter if so.
            content_blocks.extend(
                {"type": "table", "content": table}
                for table in page.extract_tables()
            )

            text = page.extract_text()
            if not text:
                # No text layer (e.g. a scanned page): fall back to OCR on a
                # rendering of the page at a resolution suitable for OCR.
                page_image = page.to_image(resolution=ocr_resolution)
                text = pytesseract.image_to_string(
                    page_image.original, lang=ocr_lang
                )

            if text:
                for line in text.split("\n"):
                    line = line.strip()
                    # Blank lines carry nothing to translate; skip them here
                    # instead of emitting empty blocks.
                    if line:
                        content_blocks.append({"type": "text", "content": line})
    return content_blocks
def _translate_text(text):
    """Translate one string; blank or missing input returns "" without a model call."""
    if not text or not text.strip():
        return ""
    return translator(text, max_length=400)[0]["translation_text"]


def translate_content(content_blocks):
    """Translate extracted content while preserving its structure.

    Args:
        content_blocks: Iterable of dicts with a ``"type"`` key (``"text"`` or
            ``"table"``) and a ``"content"`` key. Text content is a string;
            table content is a list of rows, each a list of cell values.

    Returns:
        A list of translated blocks in the original order. Text blocks whose
        content is empty or whitespace-only are omitted, as are blocks of any
        other type. Table blocks keep their row/column shape; empty,
        whitespace-only or ``None`` cells become ``""``.
    """
    translated_blocks = []
    for block in content_blocks:
        kind = block["type"]
        if kind == "text":
            source_text = block["content"]
            # Skip blank paragraphs entirely rather than emitting empty blocks.
            if source_text and source_text.strip():
                translated_blocks.append(
                    {"type": "text", "content": _translate_text(source_text)}
                )
        elif kind == "table":
            # Cells use the same blank check as text blocks, so whitespace-only
            # cells are no longer sent to the model.
            translated_table = [
                [_translate_text(cell) for cell in row]
                for row in block["content"]
            ]
            translated_blocks.append({"type": "table", "content": translated_table})
    return translated_blocks
def create_translated_doc(translated_blocks, output_path):
    """Generate a translated Word document preserving tables and text.

    Args:
        translated_blocks: Iterable of dicts with a ``"type"`` key (``"text"``
            or ``"table"``) and a ``"content"`` key: a string for text, a list
            of rows (lists of cell values) for tables.
        output_path: Destination path of the ``.docx`` file.

    Text blocks become left-aligned paragraphs. Table blocks become tables
    sized to the longest row, so ragged rows are accepted; tables without any
    cells are skipped.
    """
    doc = docx.Document()

    # New paragraphs use the document's default paragraph style. Set its font
    # size once here instead of re-assigning it on the shared style object for
    # every paragraph.
    doc.styles["Normal"].font.size = Pt(12)

    for block in translated_blocks:
        if block["type"] == "text":
            para = doc.add_paragraph(block["content"])
            para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
        elif block["type"] == "table":
            rows = block["content"]
            # Size by the widest row: using only the first row fails on an
            # empty table and on rows longer than the first one.
            n_cols = max((len(row) for row in rows), default=0)
            if n_cols == 0:
                continue
            table = doc.add_table(rows=len(rows), cols=n_cols)
            for i, row in enumerate(rows):
                for j, cell_text in enumerate(row):
                    # Cell text must be a string; treat a missing value as empty.
                    table.cell(i, j).text = "" if cell_text is None else str(cell_text)
    doc.save(output_path)
# Streamlit UI
st.title("Professional Multilingual PDF Translator")
uploaded_file = st.file_uploader("Upload a PDF document for structured translation", type=["pdf"])
output_docx_path = "translated_document.docx"  # File name offered for the download

if uploaded_file is not None:
    pdf_bytes = uploaded_file.getvalue()

    # Streamlit re-runs this script on every interaction, including a click on
    # the download button. Remember the result for the current upload in the
    # session so the document is only translated once.
    upload_digest = hashlib.sha256(pdf_bytes).hexdigest()
    if st.session_state.get("translated_digest") != upload_digest:
        with st.spinner("Processing and translating the document..."):
            # Work in a private scratch directory so concurrent sessions cannot
            # overwrite or delete each other's files; it is removed on exit,
            # even when a step raises.
            with tempfile.TemporaryDirectory() as work_dir:
                temp_file_path = os.path.join(work_dir, "uploaded_document.pdf")
                docx_path = os.path.join(work_dir, output_docx_path)
                with open(temp_file_path, "wb") as f:
                    f.write(pdf_bytes)

                # Extract content from the PDF
                content_blocks = extract_text_and_tables(temp_file_path)
                # Translate content
                translated_blocks = translate_content(content_blocks)
                # Create translated DOCX
                create_translated_doc(translated_blocks, docx_path)

                # Read the result into memory before the directory is removed
                with open(docx_path, "rb") as f:
                    docx_bytes = f.read()

        st.session_state["translated_docx"] = docx_bytes
        st.session_state["translated_digest"] = upload_digest

    # Provide download link for the translated document
    st.download_button(
        label="Download Translated Document",
        data=st.session_state["translated_docx"],
        file_name=output_docx_path,
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )