Spaces:

tahirsher
/

Multilingual_Translator-English-Urdu

Sleeping

App Files Files Community

tahirsher committed on Jan 28

Commit

464541c

verified ·

1 Parent(s): 00053f3

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -25

app.py CHANGED Viewed

@@ -3,40 +3,56 @@ import PyPDF2
 import docx2txt
 from transformers import pipeline
-# Hugging Face translation pipeline
-translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")  # Multilingual to English
-translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-ur")  # Multilingual to Urdu
 def extract_text_from_pdf(file):
-    """Extract text from a PDF file."""
-    pdf_reader = PyPDF2.PdfReader(file)
     text = ""
-    for page in pdf_reader.pages:
-        text += page.extract_text()
     return text
 def extract_text_from_word(file):
-    """Extract text from a Word file."""
-    return docx2txt.process(file)
-def translate_text(text, target_language):
-    """Translate text to the selected language."""
-    if target_language == "English":
-        return translator_en(text[:500]) if text else "No text found"
-    elif target_language == "Urdu":
-        return translator_ur(text[:500]) if text else "No text found"
-    return "Invalid translation choice."
 # Streamlit UI
-st.title("Multilingual Document Translator")
-st.write("Translate PDF or Word documents to English and Urdu quickly.")
-# File uploader
 uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
-target_language = st.radio("Select the target language for translation", ["English", "Urdu"])
 if uploaded_file:
-    # Extract text
     if uploaded_file.name.endswith(".pdf"):
         text_content = extract_text_from_pdf(uploaded_file)
     else:
@@ -46,12 +62,15 @@ if uploaded_file:
     st.subheader("Extracted Text (Preview)")
     st.write(text_content[:500] if text_content else "No content found in the file.")
-    # Perform translation
     if st.button("Translate"):
         if text_content:
             st.subheader(f"Translated Text ({target_language})")
-            translation_results = translate_text(text_content, target_language)
-            translations = "\n".join([result['translation_text'] for result in translation_results])
-            st.text_area("Translation Output", translations, height=300)
         else:
             st.warning("No text found to translate. Please upload a valid document.")

 import docx2txt
 from transformers import pipeline
+# Initialize Hugging Face Translation Pipelines (Force PyTorch Backend)
+try:
+    translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
+    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-ur", framework="pt")
+except Exception as e:
+    st.error(f"Failed to initialize translation models. Error: {e}")
 def extract_text_from_pdf(file):
+    """Extract text from PDF."""
     text = ""
+    try:
+        pdf_reader = PyPDF2.PdfReader(file)
+        for page in pdf_reader.pages:
+            text += page.extract_text()
+    except Exception as e:
+        st.error(f"Error extracting text from PDF: {e}")
     return text
 def extract_text_from_word(file):
+    """Extract text from Word file."""
+    try:
+        return docx2txt.process(file)
+    except Exception as e:
+        st.error(f"Error extracting text from Word document: {e}")
+        return ""
+def translate_text(text, translator):
+    """Translate text in chunks using the given translator."""
+    max_chunk_size = 512  # Limit due to token constraints
+    text_chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
+    translations = []
+    for chunk in text_chunks:
+        try:
+            result = translator(chunk)
+            translations.append(result[0]['translation_text'])
+        except Exception as e:
+            st.error(f"Error during translation: {e}")
+            return ""
+    return " ".join(translations)
 # Streamlit UI
+st.title("📚 Multilingual Document Translator")
+st.write("Translate PDF or Word documents to English and Urdu effortlessly!")
 uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
+target_language = st.radio("Select target language for translation", ["English", "Urdu"])
 if uploaded_file:
+    # Extract text from the uploaded file
     if uploaded_file.name.endswith(".pdf"):
         text_content = extract_text_from_pdf(uploaded_file)
     else:
     st.subheader("Extracted Text (Preview)")
     st.write(text_content[:500] if text_content else "No content found in the file.")
+    # Perform translation when the user clicks the button
     if st.button("Translate"):
         if text_content:
             st.subheader(f"Translated Text ({target_language})")
+            if target_language == "English":
+                translated_text = translate_text(text_content, translator_en)
+            else:
+                translated_text = translate_text(text_content, translator_ur)
+            st.text_area("Translation Output", translated_text, height=300)
         else:
             st.warning("No text found to translate. Please upload a valid document.")