import os

import streamlit as st
from transformers import pipeline, TrOCRProcessor, VisionEncoderDecoderModel
from pdf2image import convert_from_path
from PIL import Image
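
# NOTE: pdf2image requires the Poppler utilities to be installed on the system,
# and the Hugging Face models below are downloaded and cached on first use.
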
@st.cache_resource
def load_image_to_text_model():
    """Load the TrOCR processor and model once and cache them across Streamlit reruns."""
    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-printed")
    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-large-printed")
    return processor, model


def extract_text_with_trocr(image, processor, model):
    """Extract text from an image using TrOCR."""
    # TrOCR expects pixel values from its image processor, not a text tokenizer.
    pixel_values = processor(images=image.convert("RGB"), return_tensors="pt").pixel_values
    outputs = model.generate(pixel_values)
    return processor.batch_decode(outputs, skip_special_tokens=True)[0]


@st.cache_resource
def load_translation_models():
    """Load the Helsinki-NLP translation pipelines once and cache them."""
    translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
    return translator_en, translator_ur


translator_en, translator_ur = load_translation_models()
processor, trocr_model = load_image_to_text_model()


def extract_text_from_pdf_with_ocr(file_path):
    """Extract text from an image-based PDF by rendering each page and running TrOCR on it."""
    # TrOCR is trained on single text-line crops, so accuracy on full pages can be limited.
    text = ""
    try:
        # Render the PDF pages to images at 300 DPI before OCR.
        pages = convert_from_path(file_path, 300)
        for page_image in pages:
            text += extract_text_with_trocr(page_image, processor, trocr_model) + "\n"
    except Exception as e:
        st.error(f"Error during OCR extraction: {e}")
    return text


def translate_text(text, translator):
    """Translate text into the selected language."""
    # Split the text into fixed-size character chunks to stay within the model's input limit.
    max_chunk_size = 512
    text_chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
    translations = []

    for chunk in text_chunks:
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)


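# Streamlit user interface: upload a file, preview the extracted text, and translate it.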
st.title("Image-Based Document Translator with TrOCR and Translation Models")
st.write("Translate image-based PDF or image files using advanced models.")

uploaded_file = st.file_uploader("Upload a PDF or Image file (JPG/PNG)", type=["pdf", "jpg", "jpeg", "png"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # pdf2image needs a real file on disk, so persist the upload before processing it.
    file_path = f"/mnt/data/{uploaded_file.name}"
    os.makedirs("/mnt/data", exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    if uploaded_file.name.lower().endswith(".pdf"):
        text_content = extract_text_from_pdf_with_ocr(file_path)
    else:
        text_content = extract_text_with_trocr(Image.open(uploaded_file), processor, trocr_model)

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    if st.button("Translate"):
        if text_content:
            st.subheader(f"Translated Text ({target_language})")
            translated_text = ""
            if target_language == "English" and translator_en:
                translated_text = translate_text(text_content, translator_en)
            elif target_language == "Urdu" and translator_ur:
                translated_text = translate_text(text_content, translator_ur)
            else:
                st.warning("Translation model not loaded successfully.")

            if translated_text:
                st.text_area("Translation Output", translated_text, height=300)
        else:
            st.warning("No text found to translate. Please upload a valid document.")