tahirsher's picture
Update app.py
59f49e8 verified
raw
history blame
3.19 kB
import textwrap

import streamlit as st
import PyPDF2
import docx2txt
from transformers import pipeline
import sentencepiece
# Load translation models

# Streamlit re-executes this script on every widget interaction, so the
# pipelines are built once per server process and reused afterwards.
# Streamlit releases that predate ``st.cache_resource`` fall back to plain,
# uncached loading instead of failing with AttributeError.
_cache_resource = getattr(st, "cache_resource", lambda func: func)


@_cache_resource
def _build_translation_pipelines():
    """Build the any-language->English and English->Urdu pipelines.

    Exceptions are deliberately not handled here: they propagate to the
    caller, so a failed load is reported there and is not stored as a
    cached result.
    """
    translator_en = pipeline(
        "translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt"
    )
    translator_ur = pipeline(
        "translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt"
    )
    return translator_en, translator_ur


def load_translation_models():
    """Load translation models for English and Urdu.

    Returns:
        A ``(translator_en, translator_ur)`` tuple of translation pipelines,
        or ``(None, None)`` when either model could not be initialised. In
        the failure case the error is also shown in the Streamlit page.
    """
    try:
        return _build_translation_pipelines()
    except Exception as e:
        # Top-level UI boundary: report the problem instead of crashing the app.
        st.error(f"Error initializing translation models: {e}")
        return None, None


translator_en, translator_ur = load_translation_models()
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, concatenated in page order.

    Pages for which no text is extracted contribute an empty string. If
    reading fails part-way through, the error is shown in the Streamlit page
    and whatever text was collected before the failure is returned.
    """
    page_texts = []
    try:
        reader = PyPDF2.PdfReader(file)
        for pdf_page in reader.pages:
            extracted = pdf_page.extract_text()
            page_texts.append(extracted if extracted else "")
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
    return "".join(page_texts)
def extract_text_from_word(file):
    """Return the text content of a Word document.

    On any extraction failure the error is shown in the Streamlit page and an
    empty string is returned.
    """
    try:
        document_text = docx2txt.process(file)
    except Exception as e:
        st.error(f"Error extracting text from Word document: {e}")
        document_text = ""
    return document_text
def translate_text(text, translator, max_chunk_size=512):
    """Translate text in manageable chunks.

    The text is split on whitespace so that no word is cut in half at a
    chunk boundary; only a single token longer than ``max_chunk_size`` is
    broken mid-token. Runs of whitespace (including newlines) are collapsed
    to single spaces before translation.

    Args:
        text: The text to translate. ``None``, empty, or whitespace-only
            input yields ``""`` without calling the translator.
        translator: A callable that takes a string and returns a sequence
            whose first item is a mapping with a ``'translation_text'`` key.
        max_chunk_size: Maximum number of characters per chunk passed to
            ``translator``. Must be positive.

    Returns:
        The translated chunks joined with single spaces, or ``""`` if any
        chunk fails to translate (the error is shown in the Streamlit page).

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if not text:
        return ""

    text_chunks = textwrap.wrap(
        text,
        width=max_chunk_size,
        break_long_words=True,   # unspaced scripts / huge tokens still fit
        break_on_hyphens=False,  # keep hyphenated words intact
    )

    translations = []
    for chunk in text_chunks:
        try:
            result = translator(chunk)
            translations.append(result[0]["translation_text"])
        except Exception as e:
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)
# Streamlit UI
st.title("📚 Multilingual Document Translator")
st.write("Translate PDF or Word documents to English and Urdu effortlessly!")
uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # Extract text from the uploaded file. The extension is compared
    # case-insensitively so that e.g. "REPORT.PDF" is not sent to the Word
    # extractor.
    if uploaded_file.name.lower().endswith(".pdf"):
        text_content = extract_text_from_pdf(uploaded_file)
    else:
        text_content = extract_text_from_word(uploaded_file)

    # Show extracted text preview (first 500 characters only)
    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    # Perform translation when the user clicks the button
    if st.button("Translate"):
        if text_content:
            st.subheader(f"Translated Text ({target_language})")
            # Map each radio option to its pipeline; a value of None means
            # the model failed to load at startup.
            translators = {"English": translator_en, "Urdu": translator_ur}
            translator = translators.get(target_language)
            if translator is not None:
                translated_text = translate_text(text_content, translator)
                st.text_area("Translation Output", translated_text, height=300)
            else:
                # Only warn here: there is no translation to display, so the
                # output box is skipped rather than reading an unset variable.
                st.warning("Translation model not loaded successfully.")
        else:
            st.warning("No text found to translate. Please upload a valid document.")