Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

App Files Files Community

tahirsher committed on Jan 31

Commit

63f5b6d

verified ·

1 Parent(s): b0b875d

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -44

app.py CHANGED Viewed

@@ -3,9 +3,8 @@ from PIL import Image  # For image processing
 from transformers import pipeline
 import streamlit as st
 import os
-import io
 from docx import Document  # For Word document processing
-import asyncio  # For asynchronous processing
 # Load the TrOCR model for image-to-text (smaller model)
 trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
@@ -16,7 +15,7 @@ translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
 # Function to extract text from an image using TrOCR
 def extract_text_from_image(image):
     result = trocr_pipeline(image)
-    return result[0]['generated_text']
 # Function to extract text from a PDF (optimized for performance)
 def extract_from_pdf(pdf_path):
@@ -24,9 +23,8 @@ def extract_from_pdf(pdf_path):
     full_text = ""
     for page_num in range(len(doc)):
         page = doc.load_page(page_num)
-        # Extract text directly from the page (faster than OCR for text-based PDFs)
         full_text += page.get_text() + "\n"
-    return full_text
 # Function to extract text from a Word document
 def extract_from_word(docx_path):
@@ -34,16 +32,21 @@ def extract_from_word(docx_path):
     full_text = ""
     for para in doc.paragraphs:
         full_text += para.text + "\n"
-    return full_text
 # Function to translate text to English (batched for performance)
 def translate_text(text):
-    # Split text into smaller chunks for faster translation
     chunks = [text[i:i + 500] for i in range(0, len(text), 500)]
     translated_text = ""
     for chunk in chunks:
-        translated_chunk = translator(chunk, max_length=400)[0]['translation_text']
-        translated_text += translated_chunk + " "
     return translated_text.strip()
 # Function to create a PDF from translated text
@@ -65,38 +68,44 @@ if uploaded_file is not None:
         with open(temp_file_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
-        # Extract text based on file type
-        if file_extension == "pdf":
-            extracted_text = extract_from_pdf(temp_file_path)
-        elif file_extension in ["jpg", "jpeg", "png"]:
-            image = Image.open(temp_file_path)
-            extracted_text = extract_text_from_image(image)
-        elif file_extension == "docx":
-            extracted_text = extract_from_word(temp_file_path)
-        else:
-            st.error("Unsupported file format.")
-            st.stop()
-        # Translate the extracted text
-        translated_text = translate_text(extracted_text)
-        # Display the translated text
-        st.subheader("Translated Text (English)")
-        st.write(translated_text)
-        # Create a PDF from the translated text
-        output_pdf_path = "translated_document.pdf"
-        create_pdf(translated_text, output_pdf_path)
-        # Provide a download link for the translated PDF
-        with open(output_pdf_path, "rb") as f:
-            st.download_button(
-                label="Download Translated PDF",
-                data=f,
-                file_name="translated_document.pdf",
-                mime="application/pdf"
-            )
-        # Clean up temporary files
-        os.remove(temp_file_path)
-        os.remove(output_pdf_path)

 from transformers import pipeline
 import streamlit as st
 import os
+import re
 from docx import Document  # For Word document processing
 # Load the TrOCR model for image-to-text (smaller model)
 trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
 # Function to extract text from an image using TrOCR
 def extract_text_from_image(image):
     """Return the text TrOCR generates for ``image``, or "" if there is none.
     Passes ``image`` to the module-level ``trocr_pipeline`` and reads the
     ``'generated_text'`` field of the first result it returns.
     """
     predictions = trocr_pipeline(image)
     # An empty/falsy pipeline result means nothing was recognised.
     if not predictions:
         return ""
     return predictions[0]['generated_text']
 # Function to extract text from a PDF (optimized for performance)
 def extract_from_pdf(pdf_path):
     full_text = ""
     for page_num in range(len(doc)):
         page = doc.load_page(page_num)
         full_text += page.get_text() + "\n"
+    return full_text.strip()
 # Function to extract text from a Word document
 def extract_from_word(docx_path):
     full_text = ""
     for para in doc.paragraphs:
         full_text += para.text + "\n"
+    return full_text.strip()
+# Function to clean extracted text
def clean_text(text):
    """Remove control characters from ``text`` without fusing adjacent words.

    Whitespace-like controls (tab, newline, carriage return, vertical tab,
    form feed) are replaced by a single space so that words on neighbouring
    lines stay separated; every other C0/C1 control character is deleted.
    Leading and trailing whitespace is stripped from the result.
    """
    # Runs of whitespace controls collapse to one space. Deleting them outright
    # would turn "line one\nline two" into "line oneline two".
    spaced = re.sub(r'[\t\n\r\x0b\x0c]+', ' ', text)
    # Anything left in the control ranges carries no textual meaning.
    return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', spaced).strip()
 # Function to translate text to English (batched for performance)
 def translate_text(text, chunk_size=500, max_length=400):
     """Translate ``text`` piece by piece and return the joined translation.
     ``text`` is cut into consecutive slices of ``chunk_size`` characters;
     each non-blank slice is passed to the module-level ``translator`` with
     ``max_length=max_length``. The translated pieces are joined with single
     spaces and the result is stripped. Blank slices and results that do not
     have the expected shape are skipped, so empty input yields "".
     ``chunk_size`` must be a positive integer (``range`` rejects a zero step).
     NOTE(review): slices are cut at fixed character offsets, so a word can be
     split across two slices - confirm whether that matters for quality.
     """
     pieces = []
     for start in range(0, len(text), chunk_size):
         chunk = text[start:start + chunk_size]
         if not chunk.strip():
             continue
         result = translator(chunk, max_length=max_length)
         # Check for a non-empty list before indexing: an empty list would
         # otherwise raise IndexError on result[0].
         if isinstance(result, list) and result and 'translation_text' in result[0]:
             pieces.append(result[0]['translation_text'])
     return " ".join(pieces).strip()
 # Function to create a PDF from translated text
         # Persist the upload to disk so the path-based extractors can read it.
         with open(temp_file_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
         # Bind the output path BEFORE the try block. The finally clause reads
         # it, and any failure before the PDF is created (or the st.stop() on
         # an unsupported format, which halts the script) would otherwise
         # reach cleanup with the name unbound and raise NameError.
         output_pdf_path = "translated_document.pdf"
         try:
             # Extract text based on file type
             if file_extension == "pdf":
                 extracted_text = extract_from_pdf(temp_file_path)
             elif file_extension in ["jpg", "jpeg", "png"]:
                 image = Image.open(temp_file_path)
                 extracted_text = extract_text_from_image(image)
             elif file_extension == "docx":
                 extracted_text = extract_from_word(temp_file_path)
             else:
                 st.error("Unsupported file format.")
                 st.stop()
             # Clean and translate the extracted text
             extracted_text = clean_text(extracted_text)
             st.write("Extracted Text for Debugging (First 500 characters):", extracted_text[:500])
             translated_text = translate_text(extracted_text)
             # Display the translated text
             st.subheader("Translated Text (English)")
             st.write(translated_text)
             # Create a PDF from the translated text
             create_pdf(translated_text, output_pdf_path)
             # Provide a download link for the translated PDF
             with open(output_pdf_path, "rb") as f:
                 st.download_button(
                     label="Download Translated PDF",
                     data=f,
                     file_name="translated_document.pdf",
                     mime="application/pdf"
                 )
         finally:
             # Clean up temporary files; either may be absent if processing
             # stopped early, so check before removing.
             for path in (temp_file_path, output_pdf_path):
                 if os.path.exists(path):
                     os.remove(path)