Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

App Files Files Community

tahirsher committed on Jan 31

Commit

28d4d28

verified ·

1 Parent(s): 91f9ddd

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -125

app.py CHANGED Viewed

@@ -1,153 +1,117 @@
-import fitz  # PyMuPDF for PDF processing
-from PIL import Image
 import pytesseract
-from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration
 import streamlit as st
 import os
-import re
-from docx import Document
 from langdetect import detect
@st.cache_resource
def _load_models():
    """Load the BLIP-2 processor/model and the translation pipeline once.

    Streamlit re-executes the whole script on every widget interaction.
    Caching the loaded objects avoids re-reading the checkpoints on each rerun.

    Returns:
        A ``(processor, model, translator)`` tuple.
    """
    checkpoint = "Salesforce/blip2-opt-2.7b"
    return (
        Blip2Processor.from_pretrained(checkpoint),
        Blip2ForConditionalGeneration.from_pretrained(checkpoint),
        pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en"),
    )


# Module-level names are kept so the functions below can keep using them.
processor, model, translator = _load_models()

# Path to the Tesseract executable used for OCR.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
def extract_text_from_image(image):
    """Return text for *image*, preferring BLIP-2 output and falling back to OCR.

    The image is converted to RGB, run through the module-level BLIP-2
    ``processor``/``model`` pair, and the first decoded sequence is used.
    If that sequence is blank, Tesseract OCR is run on the RGB image instead.
    The result is stripped of surrounding whitespace.
    """
    rgb_image = image.convert("RGB")
    model_inputs = processor(images=rgb_image, return_tensors="pt")
    output_ids = model.generate(**model_inputs)
    generated = processor.batch_decode(output_ids, skip_special_tokens=True)[0]

    if generated.strip():
        return generated.strip()

    # BLIP-2 produced nothing usable: fall back to plain OCR.
    return pytesseract.image_to_string(rgb_image).strip()
def extract_from_pdf(pdf_path):
    """Extract the text of every page of a PDF, with an image-based fallback.

    Each page's embedded text is used when present. A page with no embedded
    text is rendered to a pixmap and passed to ``extract_text_from_image``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The page texts joined by newlines, stripped of surrounding whitespace.
    """
    page_texts = []
    # The context manager closes the document even if a page fails to render;
    # the previous version never closed it.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text()
            if not text.strip():
                # No embedded text: rasterize the page and read it as an image.
                pix = page.get_pixmap()
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                text = extract_text_from_image(image)
            page_texts.append(text)
    return "\n".join(page_texts).strip()
def extract_from_word(docx_path):
    """Return the paragraph texts of a .docx file, one per line, stripped."""
    paragraphs = Document(docx_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs).strip()
# Vertical tab, form feed and NEL act as line/page breaks; keep them as "\n".
_LINE_BREAK_CONTROLS = re.compile(r"[\x0b\x0c\x85]")
# Every other C0/C1 control character except tab, newline and carriage return.
_NON_PRINTING_CONTROLS = re.compile(r"[\x00-\x08\x0e-\x1f\x7f-\x84\x86-\x9f]")


def clean_text(text):
    """Remove non-printing control characters while keeping line structure.

    The previous pattern also deleted ``\\n``, ``\\t`` and ``\\r``, which fused
    the last word of one line with the first word of the next before the text
    was translated. Tabs, newlines and carriage returns are now preserved,
    and vertical tab / form feed / NEL are converted to a newline.

    Args:
        text: Raw extracted text.

    Returns:
        The cleaned text with surrounding whitespace stripped.
    """
    text = _LINE_BREAK_CONTROLS.sub("\n", text)
    return _NON_PRINTING_CONTROLS.sub("", text).strip()
def _split_on_whitespace(text, limit):
    """Split *text* into pieces of at most *limit* characters, breaking at whitespace."""
    limit = max(1, limit)
    chunks = []
    current = []
    size = 0
    for word in text.split():
        # A single token longer than the limit has to be hard-split.
        while len(word) > limit:
            if current:
                chunks.append(" ".join(current))
                current, size = [], 0
            chunks.append(word[:limit])
            word = word[limit:]
        extra = len(word) + (1 if current else 0)
        if current and size + extra > limit:
            chunks.append(" ".join(current))
            current, size = [], 0
            extra = len(word)
        current.append(word)
        size += extra
    if current:
        chunks.append(" ".join(current))
    return chunks


def translate_text(text, chunk_size=500):
    """Translate *text* to English with the module-level ``translator``.

    The text is split into small whitespace-aligned chunks. The previous
    50,000-character chunks were far larger than the model input while each
    call's output was capped at ``max_length=400``, so most of a long
    document was silently dropped.

    Args:
        text: Text to translate.
        chunk_size: Maximum number of characters sent to the model per call.

    Returns:
        The translated text, or a fixed message when the input is blank or
        already detected as English.
    """
    from langdetect.lang_detect_exception import LangDetectException

    if not text.strip():
        return "No text available for translation."

    try:
        detected_language = detect(text)
    except LangDetectException:
        # Raised when the text has no usable features (digits, symbols only).
        detected_language = "unknown"
    st.write(f"Detected language: {detected_language}")
    if detected_language == "en":
        return "The text is already in English."

    translated_parts = []
    for chunk in _split_on_whitespace(text, chunk_size):
        # truncation=True guards against a chunk that still tokenizes too long.
        result = translator(chunk, max_length=400, truncation=True)
        if isinstance(result, list) and result and "translation_text" in result[0]:
            translated_parts.append(result[0]["translation_text"])
    return " ".join(translated_parts).strip()
def create_pdf(translated_text, output_path):
    """Write *translated_text* to a PDF at *output_path*, adding pages as needed.

    ``insert_textbox`` writes nothing and returns a negative number when the
    text does not fit the rectangle, so the previous single-page version
    produced a blank PDF for long translations. The text is now wrapped into
    lines and laid out page by page; a batch that does not fit is halved and
    retried on the same page.

    Args:
        translated_text: Text to place in the document.
        output_path: Destination path of the PDF file.
    """
    import textwrap

    # Text insertion rectangle used on every page.
    rect = fitz.Rect(50, 50, 550, 750)
    lines_per_page = 40

    lines = []
    for paragraph in translated_text.splitlines() or [""]:
        # Blank paragraphs are kept as empty lines.
        lines.extend(textwrap.wrap(paragraph, width=75) or [""])

    doc = fitz.open()
    try:
        position = 0
        while True:
            page = doc.new_page()
            batch = lines_per_page
            while True:
                chunk = "\n".join(lines[position:position + batch])
                leftover = page.insert_textbox(
                    rect, chunk,
                    fontsize=12,
                    fontname="helv",
                    color=(0, 0, 0),
                )
                # A failed insert leaves the page untouched, so retrying is safe.
                if leftover >= 0 or batch == 1:
                    break
                batch = max(1, batch // 2)
            position += batch
            if position >= len(lines):
                break
        doc.save(output_path)
    finally:
        doc.close()
# tempfile is imported here because this script section is its only user.
import tempfile

st.title("Multilingual Document Translator")
uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])
if uploaded_file is not None:
    with st.spinner("Processing document..."):
        file_extension = uploaded_file.name.split(".")[-1].lower()
        # A private directory per run: fixed file names in the working
        # directory would be shared (and deleted) across concurrent sessions.
        # The directory and its contents are removed on exit, even on error.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_file_path = os.path.join(work_dir, f"temp.{file_extension}")
            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            if file_extension == "pdf":
                extracted_text = extract_from_pdf(temp_file_path)
            elif file_extension in ["jpg", "jpeg", "png"]:
                # Close the image handle before the directory is cleaned up.
                with Image.open(temp_file_path) as image:
                    extracted_text = extract_text_from_image(image)
            elif file_extension == "docx":
                extracted_text = extract_from_word(temp_file_path)
            else:
                st.error("Unsupported file format.")
                st.stop()

            extracted_text = clean_text(extracted_text)
            st.write("Extracted Text (First 50000 characters):", extracted_text[:50000])
            translated_text = translate_text(extracted_text)
            st.subheader("Translated Text (English)")
            st.write(translated_text)

            if translated_text.strip():
                output_pdf_path = os.path.join(work_dir, "translated_document.pdf")
                create_pdf(translated_text, output_pdf_path)
                # Read the bytes now; the file disappears with the directory.
                with open(output_pdf_path, "rb") as f:
                    pdf_bytes = f.read()
                st.download_button(
                    label="Download Translated PDF",
                    data=pdf_bytes,
                    file_name="translated_document.pdf",
                    mime="application/pdf"
                )
            else:
                st.warning("No content to save in the translated PDF.")

+import pdfplumber
 import pytesseract
+from transformers import pipeline
 import streamlit as st
 import os
+import docx
 from langdetect import detect
+from PIL import Image
+from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
+from docx.shared import Pt
@st.cache_resource
def _load_translator():
    """Build the multilingual-to-English translation pipeline once.

    Streamlit re-executes the whole script on every interaction. Caching the
    pipeline avoids reloading the model on each rerun.
    """
    return pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")


# Load the translation model (the cached instance is reused across reruns).
translator = _load_translator()

# Ensure Tesseract path is set (modify for your environment).
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
def _is_outside_tables(obj, table_bboxes):
    """Return True when a page object's midpoint lies in none of *table_bboxes*."""
    h_mid = (obj["x0"] + obj["x1"]) / 2
    v_mid = (obj["top"] + obj["bottom"]) / 2
    return not any(
        x0 <= h_mid < x1 and top <= v_mid < bottom
        for (x0, top, x1, bottom) in table_bboxes
    )


def extract_text_blocks_from_pdf(pdf_path):
    """Extract tables and text lines from a PDF as a list of typed blocks.

    For every page, each detected table is emitted as
    ``{"type": "table", "content": rows}``, followed by one
    ``{"type": "text", "content": line}`` per line of the remaining text.

    The text is taken from the page with the table regions filtered out.
    Previously ``extract_text()`` ran on the full page, so every table's
    content appeared twice (once as a table, once as loose text lines).

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of block dicts in page order.
    """
    extracted_content = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract tables, remembering where they sit on the page.
            tables = page.find_tables()
            table_bboxes = [table.bbox for table in tables]
            for table in tables:
                extracted_content.append({"type": "table", "content": table.extract()})

            # Extract plain text from everything that is not inside a table.
            if table_bboxes:
                text_source = page.filter(
                    lambda obj, boxes=table_bboxes: _is_outside_tables(obj, boxes)
                )
            else:
                text_source = page
            text_blocks = text_source.extract_text()
            if text_blocks:
                for para in text_blocks.split("\n"):
                    extracted_content.append({"type": "text", "content": para})
    return extracted_content
def _translate_if_needed(text):
    """Translate *text* to English unless it is blank, undetectable, or already English."""
    from langdetect.lang_detect_exception import LangDetectException

    if not text.strip():
        return text
    try:
        detected_language = detect(text)
    except LangDetectException:
        # Raised for text without language features (page numbers, dashes,
        # bare punctuation); there is nothing to translate in that case.
        return text
    if detected_language == "en":
        return text
    return translator(text, max_length=400)[0]["translation_text"]


def translate_content_blockwise(content_blocks):
    """Translate text and table blocks, returning blocks of the same shape.

    Non-blank text blocks and non-empty table cells go through the same
    detect-then-translate step. Previously a line such as a page number
    made ``detect`` raise and aborted the whole document, and table cells
    were sent to the translator even when they were already English.

    Args:
        content_blocks: Blocks of the form ``{"type": "text"|"table", "content": ...}``.

    Returns:
        A list of translated blocks. Blank text blocks are omitted; empty
        or ``None`` table cells become ``""``.
    """
    translated_content = []
    for block in content_blocks:
        if block["type"] == "text" and block["content"].strip():
            translated_content.append(
                {"type": "text", "content": _translate_if_needed(block["content"])}
            )
        elif block["type"] == "table":
            # Translate table rows cell by cell, keeping the row structure.
            translated_table = [
                [_translate_if_needed(cell) if cell else "" for cell in row]
                for row in block["content"]
            ]
            translated_content.append({"type": "table", "content": translated_table})
    return translated_content
def generate_translated_docx(translated_content, output_path):
    """Write translated blocks to a Word document at *output_path*.

    Text blocks become left-aligned paragraphs and table blocks become Word
    tables. The base font size is set once on the document's "Normal" style
    instead of on every paragraph. Tables are sized to their widest row, and
    tables without any cells are skipped; previously a ragged or empty table
    raised ``IndexError``.

    Args:
        translated_content: Blocks of the form ``{"type": "text"|"table", "content": ...}``.
        output_path: Destination path of the .docx file.
    """
    doc = docx.Document()
    doc.styles["Normal"].font.size = Pt(12)

    for block in translated_content:
        if block["type"] == "text":
            para = doc.add_paragraph(block["content"])
            para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
        elif block["type"] == "table":
            rows = block["content"]
            column_count = max((len(row) for row in rows), default=0)
            if column_count == 0:
                continue
            table = doc.add_table(rows=len(rows), cols=column_count)
            for i, row in enumerate(rows):
                for j, cell_text in enumerate(row):
                    # Cells must be strings; treat a missing value as empty.
                    table.cell(i, j).text = "" if cell_text is None else str(cell_text)
    doc.save(output_path)
# Streamlit UI
# tempfile is imported here because this script section is its only user.
import tempfile

st.title("Professional Multilingual PDF Translator")
uploaded_file = st.file_uploader("Upload a PDF document for structured translation", type=["pdf"])
if uploaded_file is not None:
    with st.spinner("Processing and translating the document..."):
        # A private directory per run replaces the fixed file names in the
        # working directory, which were shared across concurrent sessions.
        # It is removed on exit even on error, so no ``finally`` block is
        # needed. The old one referenced ``output_docx_path`` before it was
        # assigned whenever extraction or translation failed, which masked
        # the real error with a NameError.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_file_path = os.path.join(work_dir, "uploaded_document.pdf")
            output_docx_path = os.path.join(work_dir, "translated_document.docx")
            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Extract structured content
            content_blocks = extract_text_blocks_from_pdf(temp_file_path)
            # Translate content blockwise
            translated_content = translate_content_blockwise(content_blocks)
            # Create translated DOCX file
            generate_translated_docx(translated_content, output_docx_path)
            # Read the bytes now; the file disappears with the directory.
            with open(output_docx_path, "rb") as f:
                docx_bytes = f.read()

    # Provide download link for the translated document
    st.download_button(
        label="Download Translated Document",
        data=docx_bytes,
        file_name="translated_document.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    )