|
import PyPDF2 |
|
import pytesseract |
|
from PIL import Image |
|
import pdfplumber |
|
from transformers import pipeline |
|
import streamlit as st |
|
import docx |
|
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT |
|
from docx.shared import Pt |
|
import os |
|
|
|
|
|
@st.cache_resource
def _load_translator():
    """Build the multilingual-to-English translation pipeline once.

    Streamlit re-executes the whole script on every widget interaction;
    caching the pipeline as a resource keeps the model from being rebuilt
    on each rerun.
    """
    return pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")


# Shared translation pipeline (multilingual -> English).
translator = _load_translator()

# Use the conventional Linux location of the Tesseract binary when it is
# actually there; otherwise leave pytesseract's own setting untouched so a
# binary reachable through PATH can still be used.
_TESSERACT_PATH = "/usr/bin/tesseract"
if os.path.exists(_TESSERACT_PATH):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_PATH
|
|
|
|
|
def extract_text_and_tables(pdf_path, ocr_resolution=300):
    """Extract structured content from a PDF, including tables and text.

    Args:
        pdf_path: Path of the PDF file to read.
        ocr_resolution: Resolution (DPI) used to rasterise a page when it has
            no extractable text and must be OCR'd. Higher values give
            Tesseract more pixels per glyph at the cost of time and memory.

    Returns:
        A list of block dicts in page order. For each page, its tables come
        first as ``{"type": "table", "content": <rows>}``, followed by one
        ``{"type": "text", "content": <line>}`` per non-blank line of text.
    """
    content_blocks = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables():
                content_blocks.append({"type": "table", "content": table})

            # NOTE(review): extract_text() may also return the text that sits
            # inside the tables collected above, duplicating it -- confirm.
            text = page.extract_text()
            if not text:
                # No text layer (e.g. a scanned page): fall back to OCR on a
                # rendered image of the page.
                page_image = page.to_image(resolution=ocr_resolution)
                text = pytesseract.image_to_string(page_image.original)

            if not text:
                continue

            for line in text.split("\n"):
                stripped = line.strip()
                # Blank lines carry nothing to translate; do not emit
                # empty text blocks for them.
                if stripped:
                    content_blocks.append({"type": "text", "content": stripped})

    return content_blocks
|
|
|
|
|
def translate_content(content_blocks):
    """Translate extracted content while preserving its block structure.

    Args:
        content_blocks: Iterable of block dicts. ``"text"`` blocks hold a
            string in ``"content"``; ``"table"`` blocks hold a list of rows,
            each row a list of cell strings (or ``None`` for empty cells).

    Returns:
        A new list of block dicts of the same shapes with translated content.
        Text blocks whose content is blank are omitted, blocks of any other
        type are ignored, and blank or ``None`` table cells become ``""``.
    """

    def _translate(text):
        # Blank input has nothing to translate; skip the model call so that
        # whitespace-only table cells are treated like blank text blocks.
        if not text or not text.strip():
            return ""
        return translator(text, max_length=400)[0]["translation_text"]

    translated_blocks = []

    for block in content_blocks:
        kind = block["type"]

        if kind == "text":
            if block["content"].strip():
                translated_blocks.append(
                    {"type": "text", "content": _translate(block["content"])}
                )

        elif kind == "table":
            translated_table = [
                [_translate(cell) for cell in row] for row in block["content"]
            ]
            translated_blocks.append({"type": "table", "content": translated_table})

    return translated_blocks
|
|
|
|
|
def create_translated_doc(translated_blocks, output_path):
    """Generate a Word document from translated text and table blocks.

    Args:
        translated_blocks: Iterable of block dicts. ``"text"`` blocks become
            left-aligned paragraphs; ``"table"`` blocks (a list of rows of
            cell values) become Word tables.
        output_path: Destination path of the ``.docx`` file.

    Tables with no rows or no columns are skipped. Rows shorter than the
    widest row leave their trailing cells empty.
    """
    doc = docx.Document()

    for block in translated_blocks:
        if block["type"] == "text":
            para = doc.add_paragraph(block["content"])
            para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
            # NOTE: this sets the size on the paragraph's style object, not
            # on this paragraph alone.
            para.style.font.size = Pt(12)

        elif block["type"] == "table":
            rows = block["content"]
            # Size the table by its widest row so ragged rows fit, and skip
            # tables that have nothing to render.
            col_count = max((len(row) for row in rows), default=0)
            if col_count == 0:
                continue

            table = doc.add_table(rows=len(rows), cols=col_count)
            for i, row in enumerate(rows):
                for j, cell_text in enumerate(row):
                    # Cell text must be a string; treat None as empty.
                    table.cell(i, j).text = "" if cell_text is None else str(cell_text)

    doc.save(output_path)
|
|
|
|
|
|
|
# Streamlit entry point: upload a PDF, translate it, offer the result as .docx.
st.title("Professional Multilingual PDF Translator")

uploaded_file = st.file_uploader(
    "Upload a PDF document for structured translation", type=["pdf"]
)

# File name offered to the user for the translated document.
output_docx_path = "translated_document.docx"

if uploaded_file is not None:
    import tempfile

    with st.spinner("Processing and translating the document..."):
        # Work inside a private temporary directory: fixed file names in the
        # working directory would collide when several sessions run at once,
        # and the directory is removed automatically even if a step fails.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_file_path = os.path.join(work_dir, "uploaded_document.pdf")
            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            docx_path = os.path.join(work_dir, output_docx_path)

            content_blocks = extract_text_and_tables(temp_file_path)
            translated_blocks = translate_content(content_blocks)
            create_translated_doc(translated_blocks, docx_path)

            # Read the document into memory before the directory is deleted.
            with open(docx_path, "rb") as f:
                docx_bytes = f.read()

        st.download_button(
            label="Download Translated Document",
            data=docx_bytes,
            file_name=output_docx_path,
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        )
|
|