Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

App Files Community

tahirsher committed on Jan 31

Commit

87fcfea

verified ·

1 Parent(s): f9d2d7a

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -47

app.py CHANGED Viewed

@@ -1,33 +1,30 @@
-import fitz  # PyMuPDF
-import pytesseract
-from PIL import Image
-from transformers import pipeline
 import streamlit as st
 import os
 import io
-# Set the Tesseract path explicitly
-pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'  # Default path in most Linux systems
-# Set up the translation pipelines with error handling
-try:
-    translator_to_english = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
-except Exception as e:
-    st.error(f"Failed to load English translation model: {e}")
-    translator_to_english = None
-try:
-    translator_to_urdu = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur")
-except Exception as e:
-    st.error(f"Failed to load Urdu translation model: {e}")
-    translator_to_urdu = None
-# Function to extract text from an image using OCR
 def extract_text_from_image(image):
-    text = pytesseract.image_to_string(image, lang='eng+urd')
     return text
-# Function to extract images and text from a PDF
 def extract_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
     full_text = ""
@@ -44,38 +41,70 @@ def extract_from_pdf(pdf_path):
         full_text += page.get_text() + "\n"
     return full_text
-# Function to translate text to English and Urdu
 def translate_text(text):
-    english_translation = ""
-    urdu_translation = ""
-    if translator_to_english:
-        english_translation = translator_to_english(text, max_length=400)[0]['translation_text']
-    if translator_to_urdu:
-        urdu_translation = translator_to_urdu(text, max_length=400)[0]['translation_text']
-    return english_translation, urdu_translation
 # Streamlit UI
-st.title("PDF Document Translator")
-uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")
 if uploaded_file is not None:
-    with st.spinner("Processing PDF..."):
         # Save the uploaded file temporarily
-        with open("temp.pdf", "wb") as f:
             f.write(uploaded_file.getbuffer())
-        # Extract text from the PDF
-        extracted_text = extract_from_pdf("temp.pdf")
         # Translate the extracted text
-        english_translation, urdu_translation = translate_text(extracted_text)
-        # Display the translations
-        st.subheader("English Translation")
-        st.write(english_translation)
-        st.subheader("Urdu Translation")
-        st.write(urdu_translation)
-        # Clean up the temporary file
-        os.remove("temp.pdf")

+import fitz  # PyMuPDF for PDF processing
+from PIL import Image  # For image processing
+from transformers import AutoTokenizer, AutoModelForImageTextToText, AutoModelForCausalLM, pipeline
 import streamlit as st
 import os
 import io
+from docx import Document  # For Word document processing
+# Load the TrOCR model for image-to-text
+trocr_tokenizer = AutoTokenizer.from_pretrained("microsoft/trocr-large-printed")
+trocr_model = AutoModelForImageTextToText.from_pretrained("microsoft/trocr-large-printed")
+# Load the DeepSeek model for text-to-text translation
+translation_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+translation_model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
+# Set up the translation pipeline
+translator = pipeline("text-generation", model=translation_model, tokenizer=translation_tokenizer)
+# Function to extract text from an image using TrOCR
 def extract_text_from_image(image):
+    inputs = trocr_tokenizer(image, return_tensors="pt").input_ids
+    outputs = trocr_model.generate(inputs)
+    text = trocr_tokenizer.decode(outputs[0], skip_special_tokens=True)
     return text
+# Function to extract text from a PDF
 def extract_from_pdf(pdf_path):
     doc = fitz.open(pdf_path)
     full_text = ""
         full_text += page.get_text() + "\n"
     return full_text
+# Function to extract text from a Word document
+def extract_from_word(docx_path):
+    doc = Document(docx_path)
+    full_text = ""
+    for para in doc.paragraphs:
+        full_text += para.text + "\n"
+    return full_text
+# Function to translate text to English
 def translate_text(text):
+    translated_text = translator(text, max_length=400)[0]['generated_text']
+    return translated_text
+# Function to create a PDF from translated text
+def create_pdf(translated_text, output_path):
+    doc = fitz.open()
+    page = doc.new_page()
+    page.insert_text((50, 50), translated_text, fontsize=12, fontname="helv")
+    doc.save(output_path)
 # Streamlit UI
+st.title("Multilingual Document Translator")
+uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])
 if uploaded_file is not None:
+    with st.spinner("Processing document..."):
         # Save the uploaded file temporarily
+        file_extension = uploaded_file.name.split(".")[-1].lower()
+        temp_file_path = f"temp.{file_extension}"
+        with open(temp_file_path, "wb") as f:
             f.write(uploaded_file.getbuffer())
+        # Extract text based on file type
+        if file_extension == "pdf":
+            extracted_text = extract_from_pdf(temp_file_path)
+        elif file_extension in ["jpg", "jpeg", "png"]:
+            image = Image.open(temp_file_path)
+            extracted_text = extract_text_from_image(image)
+        elif file_extension == "docx":
+            extracted_text = extract_from_word(temp_file_path)
+        else:
+            st.error("Unsupported file format.")
+            st.stop()
         # Translate the extracted text
+        translated_text = translate_text(extracted_text)
+        # Display the translated text
+        st.subheader("Translated Text (English)")
+        st.write(translated_text)
+        # Create a PDF from the translated text
+        output_pdf_path = "translated_document.pdf"
+        create_pdf(translated_text, output_pdf_path)
+        # Provide a download link for the translated PDF
+        with open(output_pdf_path, "rb") as f:
+            st.download_button(
+                label="Download Translated PDF",
+                data=f,
+                file_name="translated_document.pdf",
+                mime="application/pdf"
+            )
+        # Clean up temporary files
+        os.remove(temp_file_path)
+        os.remove(output_pdf_path)