Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

App Files Files Community

tahirsher committed on Jan 31

Commit

5bb4750

verified ·

1 Parent(s): 63f5b6d

Update app.py

Browse files

Files changed (1) hide show

app.py +17 -25

app.py CHANGED Viewed

@@ -1,23 +1,19 @@
-import fitz  # PyMuPDF for PDF processing
-from PIL import Image  # For image processing
 from transformers import pipeline
 import streamlit as st
 import os
 import re
-from docx import Document  # For Word document processing
-# Load the TrOCR model for image-to-text (smaller model)
 trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
-# Load the translation model (smaller model)
 translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
-# Function to extract text from an image using TrOCR
 def extract_text_from_image(image):
     """Run the module-level TrOCR pipeline on ``image`` and return its text.

     Returns the ``'generated_text'`` field of the first pipeline result, or
     an empty string when the pipeline returns nothing.
     """
     predictions = trocr_pipeline(image)
     if not predictions:
         return ""
     return predictions[0]['generated_text']
-# Function to extract text from a PDF (optimized for performance)
 def extract_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
     full_text = ""
@@ -26,7 +22,6 @@ def extract_from_pdf(pdf_path):
         full_text += page.get_text() + "\n"
     return full_text.strip()
-# Function to extract text from a Word document
 def extract_from_word(docx_path):
     doc = Document(docx_path)
     full_text = ""
@@ -34,42 +29,44 @@ def extract_from_word(docx_path):
         full_text += para.text + "\n"
     return full_text.strip()
-# Function to clean extracted text
 def clean_text(text):
     """Strip control characters from ``text`` without gluing words together.

     Whitespace-like control characters (tab, line feed, vertical tab, form
     feed, carriage return, NEL) are first replaced by a single space; the
     extractors separate pages/paragraphs with newlines, and deleting those
     outright would fuse the last word of one line with the first word of the
     next. All remaining C0/C1 control characters are then removed and the
     result is stripped of leading/trailing whitespace.
     """
     # Runs of whitespace controls collapse to one space so word boundaries survive.
     spaced = re.sub(r'[\t\n\x0b\x0c\r\x85]+', ' ', text)
     # Everything else in the control ranges carries no textual meaning: drop it.
     return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', spaced).strip()
-# Function to translate text to English (batched for performance)
 def translate_text(text):
     """Translate ``text`` with the module-level ``translator`` pipeline.

     The text is split into chunks of at most 500 characters. Splitting is
     done on whitespace so words are not cut in half (a word longer than 500
     characters is still hard-split). Whitespace-only input yields no chunks
     and therefore an empty string.

     Returns the translated chunks joined by single spaces.
     """
     import textwrap  # local import: only this function needs it

     translated_parts = []
     for chunk in textwrap.wrap(text, width=500, break_on_hyphens=False):
         result = translator(chunk, max_length=400)
         # Guard against an empty list before indexing into the result.
         if isinstance(result, list) and result and 'translation_text' in result[0]:
             translated_parts.append(result[0]['translation_text'])
     return " ".join(translated_parts).strip()
-# Function to create a PDF from translated text
 def create_pdf(translated_text, output_path):
     """Write ``translated_text`` to a new PDF saved at ``output_path``.

     Text is wrapped and spread over as many pages as needed instead of being
     written as one unbroken run starting at (50, 50), which runs off the page
     for anything longer than a single line. The document is always closed,
     even if writing or saving fails. Empty text produces a single blank page.
     """
     import textwrap  # local import: only this function needs it

     margin = 50
     font_size = 12
     line_height = font_size * 1.5
     # Character-count wrapping is an approximation of the printable width for
     # 12pt Helvetica inside the margins — NOTE(review): not an exact measurement.
     wrap_width = 75

     lines = []
     for paragraph in translated_text.splitlines():
         # Keep blank lines so paragraph breaks survive wrapping.
         lines.extend(textwrap.wrap(paragraph, width=wrap_width) or [""])

     doc = fitz.open()
     try:
         page = doc.new_page()
         y = margin
         for line in lines:
             if y > page.rect.height - margin:
                 # Current page is full: continue on a fresh one.
                 page = doc.new_page()
                 y = margin
             if line:
                 page.insert_text((margin, y), line, fontsize=font_size, fontname="helv")
             y += line_height
         doc.save(output_path)
     finally:
         doc.close()
-# Streamlit UI
 st.title("Multilingual Document Translator")
 uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])
 if uploaded_file is not None:
     with st.spinner("Processing document..."):
-        # Save the uploaded file temporarily
         file_extension = uploaded_file.name.split(".")[-1].lower()
         temp_file_path = f"temp.{file_extension}"
         with open(temp_file_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
         try:
-            # Extract text based on file type
             if file_extension == "pdf":
                 extracted_text = extract_from_pdf(temp_file_path)
             elif file_extension in ["jpg", "jpeg", "png"]:
@@ -81,21 +78,17 @@ if uploaded_file is not None:
                 st.error("Unsupported file format.")
                 st.stop()
-            # Clean and translate the extracted text
             extracted_text = clean_text(extracted_text)
-            st.write("Extracted Text for Debugging (First 500 characters):", extracted_text[:500])
             translated_text = translate_text(extracted_text)
-            # Display the translated text
             st.subheader("Translated Text (English)")
             st.write(translated_text)
-            # Create a PDF from the translated text
             output_pdf_path = "translated_document.pdf"
             create_pdf(translated_text, output_pdf_path)
-            # Provide a download link for the translated PDF
             with open(output_pdf_path, "rb") as f:
                 st.download_button(
                     label="Download Translated PDF",
@@ -104,7 +97,6 @@ if uploaded_file is not None:
                     mime="application/pdf"
                 )
         finally:
-            # Clean up temporary files
             if os.path.exists(temp_file_path):
                 os.remove(temp_file_path)
             if os.path.exists(output_pdf_path):

+import fitz
+from PIL import Image
 from transformers import pipeline
 import streamlit as st
 import os
 import re
+from docx import Document
+from langdetect import detect
 trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
 translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
 def extract_text_from_image(image):
     """Run the module-level TrOCR pipeline on ``image`` and return its text.

     Returns the ``'generated_text'`` field of the first pipeline result, or
     an empty string when the pipeline returns nothing.
     """
     predictions = trocr_pipeline(image)
     if not predictions:
         return ""
     return predictions[0]['generated_text']
 def extract_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
     full_text = ""
         full_text += page.get_text() + "\n"
     return full_text.strip()
 def extract_from_word(docx_path):
     doc = Document(docx_path)
     full_text = ""
         full_text += para.text + "\n"
     return full_text.strip()
 def clean_text(text):
     """Strip control characters from ``text`` without gluing words together.

     Whitespace-like control characters (tab, line feed, vertical tab, form
     feed, carriage return, NEL) are first replaced by a single space; the
     extractors separate pages/paragraphs with newlines, and deleting those
     outright would fuse the last word of one line with the first word of the
     next. All remaining C0/C1 control characters are then removed and the
     result is stripped of leading/trailing whitespace.
     """
     # Runs of whitespace controls collapse to one space so word boundaries survive.
     spaced = re.sub(r'[\t\n\x0b\x0c\r\x85]+', ' ', text)
     # Everything else in the control ranges carries no textual meaning: drop it.
     return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', spaced).strip()
 def translate_text(text):
     """Detect the language of ``text`` and translate it to English.

     Returns a fixed notice string when ``text`` is blank or is detected as
     English. Otherwise the text is split into chunks of at most 500
     characters, each chunk is passed to the module-level ``translator``
     pipeline, and the translated chunks are returned joined by single spaces.

     Chunks are split on whitespace so words are not cut in half (a word
     longer than 500 characters is still hard-split). The detected language
     code is shown in the Streamlit UI via ``st.write``.
     """
     import textwrap  # local import: only this function needs it

     from langdetect import LangDetectException

     if not text.strip():
         return "No text available for translation."

     try:
         detected_language = detect(text)
     except LangDetectException:
         # langdetect raises when the text has no usable features (e.g. only
         # digits or punctuation); fall through and let the translator try.
         detected_language = "unknown"
     st.write(f"Detected language: {detected_language}")
     if detected_language == "en":
         return "The text is already in English."

     translated_parts = []
     for chunk in textwrap.wrap(text, width=500, break_on_hyphens=False):
         result = translator(chunk, max_length=400)
         # Guard against an empty list before indexing into the result.
         if isinstance(result, list) and result and 'translation_text' in result[0]:
             translated_parts.append(result[0]['translation_text'])
     return " ".join(translated_parts).strip()
 def create_pdf(translated_text, output_path):
     """Write ``translated_text`` to a new PDF saved at ``output_path``.

     Text is wrapped and spread over as many pages as needed instead of being
     written as one unbroken run starting at (50, 50), which runs off the page
     for anything longer than a single line. The document is always closed,
     even if writing or saving fails. Empty text produces a single blank page.
     """
     import textwrap  # local import: only this function needs it

     margin = 50
     font_size = 12
     line_height = font_size * 1.5
     # Character-count wrapping is an approximation of the printable width for
     # 12pt Helvetica inside the margins — NOTE(review): not an exact measurement.
     wrap_width = 75

     lines = []
     for paragraph in translated_text.splitlines():
         # Keep blank lines so paragraph breaks survive wrapping.
         lines.extend(textwrap.wrap(paragraph, width=wrap_width) or [""])

     doc = fitz.open()
     try:
         page = doc.new_page()
         y = margin
         for line in lines:
             if y > page.rect.height - margin:
                 # Current page is full: continue on a fresh one.
                 page = doc.new_page()
                 y = margin
             if line:
                 page.insert_text((margin, y), line, fontsize=font_size, fontname="helv")
             y += line_height
         doc.save(output_path)
     finally:
         doc.close()
 st.title("Multilingual Document Translator")
 uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])
 if uploaded_file is not None:
     with st.spinner("Processing document..."):
         file_extension = uploaded_file.name.split(".")[-1].lower()
         temp_file_path = f"temp.{file_extension}"
         with open(temp_file_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
         try:
             if file_extension == "pdf":
                 extracted_text = extract_from_pdf(temp_file_path)
             elif file_extension in ["jpg", "jpeg", "png"]:
                 st.error("Unsupported file format.")
                 st.stop()
             extracted_text = clean_text(extracted_text)
+            st.write("Extracted Text (First 500 characters):", extracted_text[:500])
             translated_text = translate_text(extracted_text)
             st.subheader("Translated Text (English)")
             st.write(translated_text)
             output_pdf_path = "translated_document.pdf"
             create_pdf(translated_text, output_pdf_path)
             with open(output_pdf_path, "rb") as f:
                 st.download_button(
                     label="Download Translated PDF",
                     mime="application/pdf"
                 )
         finally:
             if os.path.exists(temp_file_path):
                 os.remove(temp_file_path)
             if os.path.exists(output_pdf_path):