Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

App Files Files Community

tahirsher committed on Jan 31

Commit

c06572a

verified ·

1 Parent(s): 53afdc8

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -46

app.py CHANGED Viewed

@@ -1,71 +1,72 @@
-import pdfplumber
 import pytesseract
 from transformers import pipeline
 import streamlit as st
-import os
 import docx
-from langdetect import detect
-from PIL import Image
 from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
 from docx.shared import Pt
-# Load the translation model
 translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
-# Ensure Tesseract path is set (modify for your environment)
 pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
-def extract_text_blocks_from_pdf(pdf_path):
-    """Extract text blocks while preserving structure (tables, paragraphs) from the PDF."""
-    extracted_content = []
     with pdfplumber.open(pdf_path) as pdf:
         for page in pdf.pages:
             # Extract tables
             tables = page.extract_tables()
             for table in tables:
-                extracted_content.append({"type": "table", "content": table})
-            # Extract plain text
-            text_blocks = page.extract_text()
-            if text_blocks:
-                paragraphs = text_blocks.split("\n")
                 for para in paragraphs:
-                    extracted_content.append({"type": "text", "content": para})
-    return extracted_content
-def translate_content_blockwise(content_blocks):
-    """Translate text blocks and return structured results."""
-    translated_content = []
     for block in content_blocks:
         if block["type"] == "text" and block["content"].strip():
-            detected_language = detect(block["content"])
-            if detected_language != "en":
-                translated_text = translator(block["content"], max_length=400)[0]["translation_text"]
-            else:
-                translated_text = block["content"]
-            translated_content.append({"type": "text", "content": translated_text})
         elif block["type"] == "table":
-            # Translate table rows
             translated_table = []
             for row in block["content"]:
-                translated_row = [translator(cell, max_length=400)[0]["translation_text"] if cell else "" for cell in row]
                 translated_table.append(translated_row)
-            translated_content.append({"type": "table", "content": translated_table})
-    return translated_content
-def generate_translated_docx(translated_content, output_path):
-    """Generate a Word document with the translated content preserving tables and formatting."""
     doc = docx.Document()
-    for block in translated_content:
         if block["type"] == "text":
             para = doc.add_paragraph(block["content"])
             para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
@@ -91,16 +92,16 @@ if uploaded_file is not None:
             f.write(uploaded_file.getbuffer())
         try:
-            # Extract structured content
-            content_blocks = extract_text_blocks_from_pdf(temp_file_path)
-            # Translate content blockwise
-            translated_content = translate_content_blockwise(content_blocks)
-            # Create translated DOCX file
             output_docx_path = "translated_document.docx"
-            generate_translated_docx(translated_content, output_docx_path)
             # Provide download link for the translated document
             with open(output_docx_path, "rb") as f:
                 st.download_button(

+import PyPDF2
 import pytesseract
+from PIL import Image
+import pdfplumber
 from transformers import pipeline
 import streamlit as st
 import docx
 from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
 from docx.shared import Pt
+import os
+# Translation model pipeline
 translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
+# Set Tesseract path (modify for your environment)
 pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
def extract_text_and_tables(pdf_path):
    """Extract structured content from a PDF, including tables and text.

    For every page, the tables are collected first, followed by the page
    text split into individual lines. When a page yields no extractable
    text, the page is rendered to an image and run through Tesseract OCR.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of dicts in page order. Table blocks have the form
        ``{"type": "table", "content": table}`` where ``table`` is one
        entry of ``page.extract_tables()``; text blocks have the form
        ``{"type": "text", "content": line}`` with surrounding whitespace
        stripped (a blank line produces an empty string).
    """
    # pdfplumber renders pages at 72 DPI unless told otherwise, which is too
    # coarse for reliable OCR; Tesseract works best at roughly 300 DPI.
    ocr_resolution = 300

    content_blocks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Tables first, so they precede the page's running text.
            content_blocks.extend(
                {"type": "table", "content": table}
                for table in page.extract_tables()
            )

            text = page.extract_text()
            if not text:
                # No text layer (e.g. a scanned page): fall back to OCR.
                # NOTE(review): image_to_string uses Tesseract's default
                # language here; confirm that suits the expected documents.
                rendered = page.to_image(resolution=ocr_resolution)
                text = pytesseract.image_to_string(rendered.original)

            if text:
                content_blocks.extend(
                    {"type": "text", "content": line.strip()}
                    for line in text.split("\n")
                )
    return content_blocks
def translate_content(content_blocks):
    """Translate extracted content while preserving its block structure.

    Args:
        content_blocks: Iterable of dicts with a ``"type"`` key of either
            ``"text"`` (``"content"`` is a string) or ``"table"``
            (``"content"`` is a list of rows, each a list of cells).

    Returns:
        A new list of blocks of the same shape with translated content.
        Text blocks that are empty or whitespace-only are omitted. Table
        cells that are falsy (e.g. ``None``) or whitespace-only become
        ``""`` instead of being sent to the model.
    """

    def _translate(text):
        # Single place for the model call and its generation settings.
        return translator(text, max_length=400)[0]["translation_text"]

    translated_blocks = []
    for block in content_blocks:
        if block["type"] == "text" and block["content"].strip():
            translated_blocks.append(
                {"type": "text", "content": _translate(block["content"])}
            )
        elif block["type"] == "table":
            translated_table = []
            for row in block["content"]:
                # Mirror the text branch: blank cells carry nothing to
                # translate, so they are not passed to the model.
                translated_row = [
                    _translate(cell) if cell and cell.strip() else ""
                    for cell in row
                ]
                translated_table.append(translated_row)
            translated_blocks.append(
                {"type": "table", "content": translated_table}
            )
    return translated_blocks
+def create_translated_doc(translated_blocks, output_path):
+    """Generate a translated Word document preserving tables and text."""
     doc = docx.Document()
+    for block in translated_blocks:
         if block["type"] == "text":
             para = doc.add_paragraph(block["content"])
             para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
             f.write(uploaded_file.getbuffer())
         try:
+            # Extract content from the PDF
+            content_blocks = extract_text_and_tables(temp_file_path)
+            # Translate content
+            translated_blocks = translate_content(content_blocks)
+            # Generate translated DOCX
             output_docx_path = "translated_document.docx"
+            create_translated_doc(translated_blocks, output_docx_path)
             # Provide download link for the translated document
             with open(output_docx_path, "rb") as f:
                 st.download_button(