import os
import tempfile

import docx
import pdfplumber
import PyPDF2
import pytesseract
import streamlit as st
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt
from PIL import Image
from transformers import pipeline

# Translation model pipeline.
# Created at module top level, so it is built whenever this script is executed;
# translate_content() below calls this object for every paragraph and table cell.
# NOTE(review): the model name suggests multilingual -> English — confirm
# against the model card before relying on a specific language pair.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")

# Set Tesseract path (modify for your environment).
# pytesseract is used for the OCR fallback in extract_text_and_tables().
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"


def extract_text_and_tables(pdf_path, ocr_resolution=300):
    """Extract structured content from a PDF, including tables and text.

    For every page, the tables found by pdfplumber are emitted first (one
    block per table), followed by one text block per line of the page text.
    When a page yields no text, the page is rendered to an image and passed
    to Tesseract OCR instead.

    Args:
        pdf_path: Path handed to ``pdfplumber.open``.
        ocr_resolution: Resolution (DPI) used to render a page for the OCR
            fallback. The previous code rendered at pdfplumber's default
            resolution, which is low for OCR; 300 is a common OCR setting.

    Returns:
        A list of dicts in page order, each either
        ``{"type": "table", "content": <table rows>}`` or
        ``{"type": "text", "content": <stripped line>}``. Blank lines are
        kept as text blocks with empty content.
    """
    content_blocks = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Tables come first for each page, matching the original ordering.
            content_blocks.extend(
                {"type": "table", "content": table}
                for table in page.extract_tables()
            )

            text = page.extract_text()
            if not text:
                # No text layer (e.g. a scanned page): fall back to OCR on a
                # rendering of the page at a resolution suitable for Tesseract.
                page_image = page.to_image(resolution=ocr_resolution)
                text = pytesseract.image_to_string(page_image.original)

            if text:
                content_blocks.extend(
                    {"type": "text", "content": line.strip()}
                    for line in text.split("\n")
                )

    return content_blocks


def _translate_text(text):
    """Translate one string with the module-level ``translator``.

    ``None`` and whitespace-only input return ``""`` without calling the
    model, so blank table cells never reach the translator.
    """
    if text is None or not text.strip():
        return ""
    return translator(text, max_length=400)[0]["translation_text"]


def translate_content(content_blocks):
    """Translate extracted content preserving structure.

    Args:
        content_blocks: Iterable of dicts with a ``"type"`` key of ``"text"``
            or ``"table"`` and a ``"content"`` key holding a string (text) or
            a list of rows of cell strings / ``None`` (table).

    Returns:
        A new list of blocks of the same shape. Text blocks whose content is
        empty or whitespace-only are omitted; empty, ``None`` or
        whitespace-only table cells become ``""``. Blocks of any other type
        are skipped.
    """
    translated_blocks = []

    for block in content_blocks:
        kind = block["type"]

        if kind == "text":
            content = block["content"]
            # Blank paragraphs are dropped, as before; None is tolerated too.
            if content and content.strip():
                translated_blocks.append(
                    {"type": "text", "content": _translate_text(content)}
                )

        elif kind == "table":
            translated_table = [
                [_translate_text(cell) for cell in row]
                for row in block["content"]
            ]
            translated_blocks.append({"type": "table", "content": translated_table})

    return translated_blocks


def create_translated_doc(translated_blocks, output_path):
    """Generate a translated Word document preserving tables and text.

    Args:
        translated_blocks: Iterable of dicts with ``"type"`` of ``"text"``
            (``"content"`` is a string) or ``"table"`` (``"content"`` is a
            list of rows, each a list of cell values).
        output_path: Destination passed to ``Document.save``.

    Text blocks become left-aligned paragraphs. Table blocks become a table
    sized to the widest row, so rows of uneven length no longer overflow the
    column count; tables with no rows or no columns are skipped instead of
    raising ``IndexError``.
    """
    doc = docx.Document()

    for block in translated_blocks:
        if block["type"] == "text":
            para = doc.add_paragraph(block["content"])
            para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
            # NOTE(review): this sets the size on the paragraph's style object,
            # not on the paragraph itself — presumably shared by every
            # paragraph using that style; confirm this is intended.
            para.style.font.size = Pt(12)

        elif block["type"] == "table":
            rows = block["content"]
            # Size the table to the widest row rather than assuming that the
            # first row exists and is representative.
            col_count = max((len(row) for row in rows), default=0)
            if col_count == 0:
                continue

            table = doc.add_table(rows=len(rows), cols=col_count)
            for i, row in enumerate(rows):
                for j, cell_text in enumerate(row):
                    # Cell text must be a string; treat a missing value as empty.
                    table.cell(i, j).text = "" if cell_text is None else str(cell_text)

    doc.save(output_path)


# Streamlit UI
# Flow: upload a PDF -> extract text/tables -> translate -> build a DOCX ->
# offer it for download.
st.title("Professional Multilingual PDF Translator")
uploaded_file = st.file_uploader("Upload a PDF document for structured translation", type=["pdf"])

output_docx_path = "translated_document.docx"  # Ensure the variable is defined outside any block

if uploaded_file is not None:
    with st.spinner("Processing and translating the document..."):
        # Work inside a uniquely named directory that is removed when the block
        # exits. Fixed file names in the current working directory would be
        # shared by every session running this script, so concurrent uploads
        # could overwrite or delete each other's files.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_file_path = os.path.join(work_dir, "uploaded_document.pdf")
            docx_file_path = os.path.join(work_dir, output_docx_path)

            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Extract content from the PDF
            content_blocks = extract_text_and_tables(temp_file_path)

            # Translate content
            translated_blocks = translate_content(content_blocks)

            # Create translated DOCX
            create_translated_doc(translated_blocks, docx_file_path)

            # Read the result into memory before the directory is cleaned up,
            # so the download button never depends on a file that is gone.
            with open(docx_file_path, "rb") as f:
                docx_bytes = f.read()

        # Provide download link for the translated document
        st.download_button(
            label="Download Translated Document",
            data=docx_bytes,
            file_name="translated_document.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        )