Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

App Files Files Community

tahirsher committed on Jan 28

Commit

ded567d

verified ·

1 Parent(s): 59f49e8

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -43

app.py CHANGED Viewed

@@ -2,11 +2,12 @@ import streamlit as st
 import PyPDF2
 import docx2txt
 from transformers import pipeline
-import sentencepiece
 # Load translation models
 def load_translation_models():
-    """Load translation models for English and Urdu."""
     try:
         translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
         translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
@@ -17,59 +18,33 @@ def load_translation_models():
 translator_en, translator_ur = load_translation_models()
def extract_text_from_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Pages for which no text can be extracted contribute an empty string.
    If reading fails, the error is shown through ``st.error`` and whatever
    text was gathered before the failure is returned.
    """
    page_texts = []
    try:
        reader = PyPDF2.PdfReader(file)
        for pdf_page in reader.pages:
            extracted = pdf_page.extract_text()
            # extract_text() may yield nothing for a page; treat that as "".
            page_texts.append(extracted or "")
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
    return "".join(page_texts)
def extract_text_from_word(file):
    """Return the text content of a Word document.

    On failure the error is shown through ``st.error`` and an empty string
    is returned instead.
    """
    try:
        document_text = docx2txt.process(file)
    except Exception as e:
        st.error(f"Error extracting text from Word document: {e}")
        document_text = ""
    return document_text
def _split_into_chunks(text, max_chunk_size):
    """Split text into pieces of at most max_chunk_size characters, cutting at whitespace when possible."""
    chunks = []
    start = 0
    length = len(text)
    while start < length:
        end = min(start + max_chunk_size, length)
        if end < length and not text[end].isspace():
            # The hard limit falls inside a word: back up to the last
            # whitespace in the window so the word is kept whole. Index 0 is
            # excluded so every iteration is guaranteed to make progress.
            window = text[start:end]
            cut = next(
                (i for i in range(len(window) - 1, 0, -1) if window[i].isspace()),
                None,
            )
            if cut is not None:
                end = start + cut
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end
    return chunks


def translate_text(text, translator, max_chunk_size=512):
    """Translate text in manageable chunks.

    The text is split into chunks of at most ``max_chunk_size`` characters.
    Chunks are cut at whitespace where possible so that words are not split
    in half; a single run longer than the limit is still cut at the limit.

    Args:
        text: The text to translate. Empty or ``None`` yields ``""``.
        translator: Callable taking a string and returning a sequence whose
            first item is a mapping with a ``'translation_text'`` key.
        max_chunk_size: Maximum number of characters per chunk (default 512).

    Returns:
        The translated chunks joined by single spaces, or ``""`` if the input
        is empty or any chunk fails to translate (the error is reported via
        ``st.error``).

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be at least 1")
    if not text:
        return ""

    translations = []
    for chunk in _split_into_chunks(text, max_chunk_size):
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            st.error(f"Error during translation: {e}")
            # A partial translation would be misleading, so give up entirely.
            return ""
    return " ".join(translations)
-# Streamlit UI
-st.title("📚 Multilingual Document Translator")
-st.write("Translate PDF or Word documents to English and Urdu effortlessly!")
 uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
 target_language = st.radio("Select target language for translation", ["English", "Urdu"])
 if uploaded_file:
-    # Extract text from the uploaded file
-    if uploaded_file.name.endswith(".pdf"):
-        text_content = extract_text_from_pdf(uploaded_file)
-    else:
-        text_content = extract_text_from_word(uploaded_file)
-    # Show extracted text preview
     st.subheader("Extracted Text (Preview)")
     st.write(text_content[:500] if text_content else "No content found in the file.")
-    # Perform translation when the user clicks the button
     if st.button("Translate"):
         if text_content:
             st.subheader(f"Translated Text ({target_language})")

 import PyPDF2
 import docx2txt
 from transformers import pipeline
+import pytesseract
+from pdf2image import convert_from_path
+from PIL import Image
 # Load translation models
 def load_translation_models():
     try:
         translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
         translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
 translator_en, translator_ur = load_translation_models()
def extract_text_from_pdf_with_ocr(file_path):
    """Extract text from an image-based PDF using OCR.

    Each page of the PDF at ``file_path`` is rendered to an image at 300 DPI
    and run through Tesseract.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The recognised text of the processed pages, each followed by a
        newline. If an error occurs it is reported via ``st.error`` and the
        text gathered before the failure (possibly empty) is returned.
    """
    page_texts = []
    try:
        # convert_from_path already yields PIL images, which pytesseract
        # accepts directly, so no Image.fromarray round-trip is needed.
        for page in convert_from_path(file_path, dpi=300):
            page_texts.append(pytesseract.image_to_string(page) + "\n")
    except Exception as e:
        st.error(f"Error during OCR extraction: {e}")
    return "".join(page_texts)
+# Streamlit UI for document translation
+st.title("📚 Multilingual Document Translator with OCR Support")
 uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
 target_language = st.radio("Select target language for translation", ["English", "Urdu"])
 if uploaded_file:
+    file_path = f"/mnt/data/{uploaded_file.name}"
+    # OCR-based text extraction for PDFs
+    text_content = extract_text_from_pdf_with_ocr(file_path) if uploaded_file.name.endswith(".pdf") else extract_text_from_word(uploaded_file)
     st.subheader("Extracted Text (Preview)")
     st.write(text_content[:500] if text_content else "No content found in the file.")
     if st.button("Translate"):
         if text_content:
             st.subheader(f"Translated Text ({target_language})")