File size: 4,371 Bytes
c06572a
40548f3
c06572a
 
28d4d28
7037128
28d4d28
 
 
c06572a
04f9dd5
c06572a
add3a0f
ade99c4
c06572a
40548f3
 
b57bd69
c06572a
 
 
 
28d4d28
 
 
 
 
c06572a
 
 
 
 
 
 
 
 
 
28d4d28
c06572a
 
 
28d4d28
 
c06572a
 
 
28d4d28
 
 
c06572a
 
 
28d4d28
 
 
c06572a
 
 
28d4d28
c06572a
 
 
5bb4750
 
c06572a
 
28d4d28
87fcfea
c06572a
28d4d28
 
 
 
 
 
 
 
 
 
b57bd69
87fcfea
ade99c4
b57bd69
28d4d28
 
 
ade99c4
5c859a7
 
7037128
28d4d28
 
87fcfea
7037128
87fcfea
63f5b6d
c06572a
 
 
 
 
 
5c859a7
c06572a
 
28d4d28
 
 
 
 
 
 
 
63f5b6d
28d4d28
63f5b6d
 
28d4d28
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import os
import tempfile

import docx
import pdfplumber
import PyPDF2
import pytesseract
import streamlit as st
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt
from PIL import Image
from transformers import pipeline

# Translation model pipeline: a `transformers` "translation" pipeline built from
# the "Helsinki-NLP/opus-mt-mul-en" checkpoint. It is created at module level, so
# it is constructed whenever this script is executed, and it is the object that
# translate_content() calls for every string.
# NOTE(review): the checkpoint name suggests multilingual -> English; confirm
# against the model card before relying on a specific language pair.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")

# Set Tesseract path (modify for your environment). This is the executable used
# by the pytesseract OCR fallback in extract_text_and_tables().
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"


def extract_text_and_tables(pdf_path, ocr_resolution=300):
    """Extract structured content from a PDF, including tables and text.

    Each page is processed in order. For every page the tables found by
    pdfplumber are emitted first, followed by one text block per non-blank
    line of the page text. When the page has no extractable text layer, the
    page is rasterised and run through Tesseract OCR instead.

    NOTE(review): the page text is taken from the whole page, so text that
    lives inside a detected table presumably also appears as text blocks;
    verify whether that duplication is wanted.

    Args:
        pdf_path: Location of the PDF; handed to ``pdfplumber.open`` as is.
        ocr_resolution: DPI used to rasterise a page for the OCR fallback.
            pdfplumber's own default (72 DPI) is too coarse for reliable
            OCR, so a higher value is used unless the caller overrides it.

    Returns:
        A list of dicts, each either
        ``{"type": "table", "content": <rows as returned by pdfplumber>}`` or
        ``{"type": "text", "content": <stripped, non-empty line>}``.
    """
    content_blocks = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Tables of the page come first, exactly as pdfplumber reports them.
            for table in page.extract_tables():
                content_blocks.append({"type": "table", "content": table})

            text = page.extract_text()
            if not text:
                # No text layer (e.g. a scanned page): fall back to OCR on a
                # rendering that is sharp enough for Tesseract.
                page_image = page.to_image(resolution=ocr_resolution)
                text = pytesseract.image_to_string(page_image.original)

            if not text:
                continue

            for line in text.split("\n"):
                stripped = line.strip()
                # Blank lines (frequent in OCR output) carry nothing to
                # translate, so they are not emitted as blocks.
                if stripped:
                    content_blocks.append({"type": "text", "content": stripped})

    return content_blocks


def _translate_text(text):
    """Translate one string with the module-level ``translator``.

    ``None``, empty and whitespace-only input is returned as ``""`` without
    calling the model, since there is nothing to translate.
    """
    if not text or not text.strip():
        return ""
    return translator(text, max_length=400)[0]["translation_text"]


def translate_content(content_blocks):
    """Translate extracted content while preserving its block structure.

    Args:
        content_blocks: Iterable of dicts with a ``"type"`` key (``"text"`` or
            ``"table"``) and a ``"content"`` key. Text content is a string;
            table content is a list of rows, each a list of cell strings
            (cells may be ``None``).

    Returns:
        A new list of blocks in the same order. Text blocks whose content is
        missing or blank are dropped; table blocks keep their row/column
        layout, with blank or ``None`` cells becoming ``""``. Blocks of any
        other type are omitted.
    """
    translated_blocks = []

    for block in content_blocks:
        if block["type"] == "text":
            source = block["content"]
            # Blank paragraphs are skipped entirely rather than emitted empty.
            if source and source.strip():
                translated_blocks.append(
                    {"type": "text", "content": _translate_text(source)}
                )

        elif block["type"] == "table":
            translated_table = [
                [_translate_text(cell) for cell in row]
                for row in block["content"]
            ]
            translated_blocks.append({"type": "table", "content": translated_table})

    return translated_blocks


def create_translated_doc(translated_blocks, output_path):
    """Generate a translated Word document preserving tables and text.

    Args:
        translated_blocks: Iterable of dicts with ``"type"`` (``"text"`` or
            ``"table"``) and ``"content"``. Text content becomes one
            left-aligned paragraph; table content (a list of rows of cell
            values) becomes a Word table with one row per input row.
        output_path: Destination handed to ``Document.save``.

    Tables with no rows or no columns are skipped. Rows shorter than the
    widest row leave their trailing cells empty, and ``None`` cells are
    written as empty strings.
    """
    doc = docx.Document()

    # Paragraphs added without an explicit style use the document's default
    # paragraph style, so the 12 pt size is applied to it once up front rather
    # than being reassigned for every paragraph.
    doc.styles["Normal"].font.size = Pt(12)

    for block in translated_blocks:
        if block["type"] == "text":
            para = doc.add_paragraph(block["content"])
            para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT

        elif block["type"] == "table":
            rows = block["content"] or []
            # Size the table by the widest row so ragged rows cannot index
            # past the last column.
            col_count = max((len(row) for row in rows), default=0)
            if col_count == 0:
                continue  # nothing to render for an empty table

            table = doc.add_table(rows=len(rows), cols=col_count)
            for i, row in enumerate(rows):
                for j, cell_text in enumerate(row):
                    # The cell text setter needs a string; treat None as empty.
                    table.cell(i, j).text = "" if cell_text is None else str(cell_text)

    doc.save(output_path)


# Streamlit UI
st.title("Professional Multilingual PDF Translator")
uploaded_file = st.file_uploader("Upload a PDF document for structured translation", type=["pdf"])

# Base name of the generated document inside the scratch directory.
output_docx_path = "translated_document.docx"

if uploaded_file is not None:
    with st.spinner("Processing and translating the document..."):
        # Work inside a private scratch directory: fixed file names in the
        # current working directory would be shared (and overwritten or
        # deleted) by concurrent sessions. The directory and everything in it
        # is removed when the block exits, including on error.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_file_path = os.path.join(work_dir, "uploaded_document.pdf")
            translated_docx_path = os.path.join(work_dir, output_docx_path)

            # Persist the upload so it can be opened by path.
            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Extract content from the PDF
            content_blocks = extract_text_and_tables(temp_file_path)

            # Translate content
            translated_blocks = translate_content(content_blocks)

            # Create translated DOCX
            create_translated_doc(translated_blocks, translated_docx_path)

            # Read the result into memory before the scratch directory is
            # removed, so the download does not depend on a file that no
            # longer exists.
            with open(translated_docx_path, "rb") as f:
                translated_docx_bytes = f.read()

    # Provide download link for the translated document
    st.download_button(
        label="Download Translated Document",
        data=translated_docx_bytes,
        file_name="translated_document.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )