"""Streamlit app that translates an uploaded PDF into a Word document.

Flow: pdfplumber extracts tables and text from each page (with a Tesseract
OCR fallback for pages that yield no text), every block is passed through the
``Helsinki-NLP/opus-mt-mul-en`` translation pipeline, and python-docx writes
the translated blocks to a ``.docx`` file offered for download.
"""

import os
import tempfile

import docx
import pdfplumber
import PyPDF2
import pytesseract
import streamlit as st
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt
from PIL import Image
from transformers import pipeline

# Translation model pipeline.
# NOTE(review): Streamlit re-executes this script on every interaction, so the
# model is rebuilt on each rerun. Consider caching it (e.g. st.cache_resource)
# if the installed Streamlit version provides that decorator.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")

# Set Tesseract path (modify for your environment)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"


def extract_text_and_tables(pdf_path, ocr_resolution=300):
    """Extract structured content from a PDF, including tables and text.

    Args:
        pdf_path: Path (or file-like object) accepted by ``pdfplumber.open``.
        ocr_resolution: DPI used to rasterise a page for the OCR fallback.
            pdfplumber renders at 72 DPI by default, which is too coarse for
            reliable Tesseract output, so a higher value is used here.

    Returns:
        A list of blocks in page order. Each block is a dict with a ``"type"``
        key (``"table"`` or ``"text"``) and a ``"content"`` key: the table as
        returned by ``page.extract_tables()``, or one stripped line of text.
        Blank lines are kept as text blocks with empty content.
    """
    content_blocks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Tables first, so they precede the page's text blocks.
            for table in page.extract_tables():
                content_blocks.append({"type": "table", "content": table})

            text = page.extract_text()
            if not text:
                # No extractable text: fall back to OCR on a rendered image
                # of the page.
                page_image = page.to_image(resolution=ocr_resolution)
                text = pytesseract.image_to_string(page_image.original)

            if text:
                # Every line of the page becomes its own text block.
                content_blocks.extend(
                    {"type": "text", "content": line.strip()}
                    for line in text.split("\n")
                )
    return content_blocks


def _translate_text(text):
    """Translate one string; return "" for None or blank input."""
    if not text or not text.strip():
        # Nothing to translate; skip the model call entirely.
        return ""
    return translator(text, max_length=400)[0]["translation_text"]


def translate_content(content_blocks):
    """Translate extracted content while preserving its block structure.

    Args:
        content_blocks: Blocks as produced by ``extract_text_and_tables``.

    Returns:
        A new list of blocks. Text blocks with blank content are dropped;
        table blocks keep their row/column layout, with blank or ``None``
        cells replaced by ``""``. Blocks of any other type are ignored.
    """
    translated_blocks = []
    for block in content_blocks:
        block_type = block["type"]
        if block_type == "text":
            source_text = block["content"]
            if source_text and source_text.strip():
                translated_blocks.append(
                    {"type": "text", "content": _translate_text(source_text)}
                )
        elif block_type == "table":
            translated_table = [
                [_translate_text(cell) for cell in row]
                for row in block["content"]
            ]
            translated_blocks.append(
                {"type": "table", "content": translated_table}
            )
    return translated_blocks


def create_translated_doc(translated_blocks, output_path):
    """Generate a Word document from translated text and table blocks.

    Args:
        translated_blocks: Blocks as produced by ``translate_content``.
        output_path: Destination path of the ``.docx`` file.

    Empty tables are skipped. Tables with rows of unequal length are sized to
    the widest row; missing cells are left empty.
    """
    doc = docx.Document()
    # The font size lives on the shared paragraph style, so set it once
    # instead of re-assigning it for every paragraph.
    doc.styles["Normal"].font.size = Pt(12)

    for block in translated_blocks:
        if block["type"] == "text":
            para = doc.add_paragraph(block["content"])
            para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
        elif block["type"] == "table":
            rows = [row or [] for row in (block["content"] or [])]
            col_count = max((len(row) for row in rows), default=0)
            if col_count == 0:
                # An empty table has no cells to fill; nothing to render.
                continue
            table = doc.add_table(rows=len(rows), cols=col_count)
            for i, row in enumerate(rows):
                for j, cell_text in enumerate(row):
                    table.cell(i, j).text = cell_text or ""
    doc.save(output_path)


# Streamlit UI
st.title("Professional Multilingual PDF Translator")
uploaded_file = st.file_uploader(
    "Upload a PDF document for structured translation", type=["pdf"]
)
output_docx_path = "translated_document.docx"  # Ensure the variable is defined outside any block

if uploaded_file is not None:
    with st.spinner("Processing and translating the document..."):
        # A private scratch directory per run keeps concurrent sessions from
        # overwriting each other's files and is removed even on failure.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_file_path = os.path.join(work_dir, "uploaded_document.pdf")
            docx_path = os.path.join(work_dir, output_docx_path)

            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Extract content from the PDF
            content_blocks = extract_text_and_tables(temp_file_path)
            # Translate content
            translated_blocks = translate_content(content_blocks)
            # Create translated DOCX
            create_translated_doc(translated_blocks, docx_path)

            # Read the result into memory before the directory is deleted.
            with open(docx_path, "rb") as f:
                docx_bytes = f.read()

        # Provide download link for the translated document
        st.download_button(
            label="Download Translated Document",
            data=docx_bytes,
            file_name="translated_document.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )