# app.py — uploaded by tahirsher; commit ded567d ("Update app.py"), verified, 2.45 kB.
import tempfile
from pathlib import Path

import docx2txt
import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
from transformers import pipeline
# Load translation models
@st.cache_resource
def _load_translation_pipelines():
    """Build the two Helsinki-NLP translation pipelines.

    Wrapped in ``st.cache_resource`` so the models are constructed once per
    server process instead of on every Streamlit rerun (the whole script is
    re-executed on each widget interaction). An exception raised here is not
    cached, so a failed load is retried on the next rerun.
    """
    to_english = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    to_urdu = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
    return to_english, to_urdu


def load_translation_models():
    """Return the ``(to-English, to-Urdu)`` translation pipelines.

    Returns:
        A 2-tuple ``(translator_en, translator_ur)``. If either model fails
        to initialise, the error is shown in the UI via ``st.error`` and
        ``(None, None)`` is returned so callers can detect the failure.
    """
    try:
        return _load_translation_pipelines()
    except Exception as e:
        # Top-level UI boundary: report the failure to the user rather than
        # crashing the app; callers check for None before translating.
        st.error(f"Error initializing translation models: {e}")
        return None, None


translator_en, translator_ur = load_translation_models()
def extract_text_from_pdf_with_ocr(file_path, dpi=300):
    """Extract text from an image-based PDF using OCR.

    Each page of the PDF is rasterised and passed through Tesseract.

    Args:
        file_path: Path to the PDF file on disk.
        dpi: Resolution used when rendering pages to images (default 300,
            the value previously hard-coded).

    Returns:
        The recognised text of all pages, each page followed by a newline.
        If rendering or OCR fails part-way, the error is reported through
        ``st.error`` and the text gathered so far (possibly ``""``) is
        returned.
    """
    page_texts = []
    try:
        # convert_from_path already yields PIL images, so they can be handed
        # to Tesseract directly without an extra Image.fromarray copy.
        for page in convert_from_path(file_path, dpi=dpi):
            # Appending inside the try (rather than a comprehension) keeps
            # the pages recognised before a failure.
            page_texts.append(pytesseract.image_to_string(page) + "\n")
    except Exception as e:
        st.error(f"Error during OCR extraction: {e}")
    return "".join(page_texts)
# Streamlit UI for document translation.
#
# Flow: upload a PDF/DOCX -> extract its text (OCR for PDFs) -> preview the
# first 500 characters -> translate on demand into the selected language.
st.title("📚 Multilingual Document Translator with OCR Support")
uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # NOTE(review): extract_text_from_word and translate_text are not defined
    # in the visible part of this file — confirm they exist before deploying.
    if uploaded_file.name.lower().endswith(".pdf"):
        # The OCR helper reads from a filesystem path, but the upload is only
        # available as an in-memory buffer, so spool it to a temporary file
        # that is removed as soon as extraction finishes.
        with tempfile.TemporaryDirectory() as tmp_dir:
            pdf_path = Path(tmp_dir) / "upload.pdf"
            pdf_path.write_bytes(uploaded_file.getvalue())
            text_content = extract_text_from_pdf_with_ocr(str(pdf_path))
    else:
        text_content = extract_text_from_word(uploaded_file)

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    if st.button("Translate"):
        if not text_content:
            st.warning("No text found to translate. Please upload a valid document.")
        else:
            st.subheader(f"Translated Text ({target_language})")
            # Pick the pipeline for the chosen language; it is None when the
            # models failed to load.
            translator = {"English": translator_en, "Urdu": translator_ur}.get(target_language)
            if translator is None:
                st.warning("Translation model not loaded successfully.")
            else:
                translated_text = translate_text(text_content, translator)
                # Rendered only after a successful translation, so the output
                # box never references an unassigned variable.
                st.text_area("Translation Output", translated_text, height=300)