|
import tempfile
from pathlib import Path

import docx2txt
import PyPDF2
import pytesseract
import streamlit as st
from pdf2image import convert_from_path
from PIL import Image
from transformers import pipeline
|
|
|
|
|
@st.cache_resource
def _load_translation_pipelines():
    """Build the two Hugging Face translation pipelines.

    Wrapped in ``st.cache_resource`` so the models are created once per
    server process instead of on every Streamlit rerun. Exceptions propagate
    to the caller, so a failed load is not cached.
    """
    to_english = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    to_urdu = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
    return to_english, to_urdu


def load_translation_models():
    """Return the ``(translator_en, translator_ur)`` translation pipelines.

    Returns:
        A 2-tuple of pipelines: multilingual-to-English and English-to-Urdu.
        If either pipeline fails to initialise, the error is shown in the
        Streamlit UI and ``(None, None)`` is returned.
    """
    try:
        return _load_translation_pipelines()
    except Exception as e:
        # UI boundary: report the failure and let the app keep rendering.
        # Handling this outside the cached helper means the next rerun retries.
        st.error(f"Error initializing translation models: {e}")
        return None, None


# Streamlit re-executes this script on every widget interaction; the cached
# helper above keeps this call cheap after the first successful load.
translator_en, translator_ur = load_translation_models()
|
|
|
def extract_text_from_pdf_with_ocr(file_path, dpi=300):
    """Extract text from an image-based PDF using OCR.

    Args:
        file_path: Path of the PDF file on disk.
        dpi: Resolution used when rasterising each page before OCR.

    Returns:
        The OCR text of all pages, each page followed by a newline. If an
        error occurs, it is reported in the Streamlit UI and the text of the
        pages processed before the failure (possibly empty) is returned.
    """
    page_texts = []
    try:
        # convert_from_path already yields PIL images, which pytesseract
        # accepts directly; no intermediate array/image conversion is needed.
        for page in convert_from_path(file_path, dpi=dpi):
            page_texts.append(pytesseract.image_to_string(page))
    except Exception as e:
        st.error(f"Error during OCR extraction: {e}")
    return "".join(f"{page_text}\n" for page_text in page_texts)
|
|
|
|
|
# --- Streamlit page: upload a document, preview its text, translate it ---
# NOTE(review): extract_text_from_word and translate_text are not defined in
# the visible part of this file — confirm they exist before this point.
st.title("π Multilingual Document Translator with OCR Support")
uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # Compare the extension case-insensitively so "SCAN.PDF" is still OCR'd.
    if uploaded_file.name.lower().endswith(".pdf"):
        # The OCR helper reads from a path on disk, so persist the upload
        # first. A temporary directory with a fixed file name avoids both a
        # hard-coded location and building a path from the user-supplied name.
        with tempfile.TemporaryDirectory() as tmp_dir:
            pdf_path = Path(tmp_dir) / "upload.pdf"
            pdf_path.write_bytes(uploaded_file.getvalue())
            text_content = extract_text_from_pdf_with_ocr(str(pdf_path))
    else:
        text_content = extract_text_from_word(uploaded_file)

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    if st.button("Translate"):
        if text_content:
            st.subheader(f"Translated Text ({target_language})")
            if target_language == "English" and translator_en:
                translated_text = translate_text(text_content, translator_en)
            elif target_language == "Urdu" and translator_ur:
                translated_text = translate_text(text_content, translator_ur)
            else:
                # No usable model: warn and skip the output box instead of
                # referencing an unbound variable.
                translated_text = None
                st.warning("Translation model not loaded successfully.")

            if translated_text is not None:
                st.text_area("Translation Output", translated_text, height=300)
        else:
            st.warning("No text found to translate. Please upload a valid document.")
|
|