File size: 3,577 Bytes
ade99c4 f53330e ded567d f53330e ade99c4 f53330e 04f9dd5 f53330e 04f9dd5 f53330e ade99c4 ded567d f53330e ade99c4 464541c f53330e 464541c ded567d ade99c4 f53330e 464541c ade99c4 ded567d f53330e ded567d ade99c4 04f9dd5 464541c 04f9dd5 464541c 04f9dd5 464541c ade99c4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import os
import tempfile
import textwrap

import pytesseract
import streamlit as st
import torch
from pdf2image import convert_from_path
from PIL import Image
from transformers import (
    AutoModelForImageTextToText,
    AutoTokenizer,
    TrOCRProcessor,
    pipeline,
)
# Image-to-Text Model (TrOCR)
def load_image_to_text_model():
    """Load the TrOCR processor and model used for image-to-text extraction.

    Returns:
        A ``(processor, model)`` tuple. The first element is returned in the
        slot callers name ``tokenizer``; it is a ``TrOCRProcessor``, which is
        callable on images (yielding ``pixel_values``) and also exposes
        ``batch_decode`` for turning generated ids back into text.
    """
    checkpoint = "microsoft/trocr-large-printed"
    # A plain text tokenizer cannot turn an image into ``pixel_values``; the
    # TrOCR processor bundles the image preprocessor with the tokenizer, which
    # is what ``extract_text_with_trocr`` needs from this object.
    processor = TrOCRProcessor.from_pretrained(checkpoint)
    model = AutoModelForImageTextToText.from_pretrained(checkpoint)
    return processor, model
def extract_text_with_trocr(image, tokenizer, model):
    """Extract text from an image using TrOCR.

    Args:
        image: The image to read. Objects exposing a ``mode`` attribute other
            than ``"RGB"`` (e.g. RGBA, palette or grayscale PIL images) are
            converted to RGB first.
        tokenizer: Callable preprocessor whose result exposes
            ``pixel_values`` and which provides ``batch_decode``.
        model: Object providing ``generate(pixel_values)``.

    Returns:
        The first decoded string produced for the image.
    """
    # Uploaded PNGs are frequently RGBA / palette / grayscale; normalise to
    # three channels so the preprocessor always sees the same layout.
    # NOTE(review): assumes the preprocessor expects RGB input, as in the
    # TrOCR usage examples -- confirm for the checkpoint in use.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    pixel_values = tokenizer(image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
# Multilingual Translation Models
def load_translation_models():
    """Build the two translation pipelines used by the app.

    Returns:
        A ``(translator_en, translator_ur)`` tuple: the first pipeline uses the
        ``Helsinki-NLP/opus-mt-mul-en`` checkpoint, the second
        ``Helsinki-NLP/opus-mt-en-ur``. Both are created with the PyTorch
        framework.
    """
    checkpoints = ("Helsinki-NLP/opus-mt-mul-en", "Helsinki-NLP/opus-mt-en-ur")
    # The list is built in checkpoint order, so the English pipeline is
    # created before the Urdu one.
    to_english, to_urdu = [
        pipeline("translation", model=checkpoint, framework="pt")
        for checkpoint in checkpoints
    ]
    return to_english, to_urdu
# Streamlit re-executes this script from the top on every widget interaction.
# Wrapping the loaders in ``st.cache_resource`` keeps a single loaded copy of
# each model for the lifetime of the server process instead of reloading the
# checkpoints on every rerun (e.g. each click on "Translate").
translator_en, translator_ur = st.cache_resource(load_translation_models)()
tokenizer, trocr_model = st.cache_resource(load_image_to_text_model)()
def extract_text_from_pdf_with_ocr(file_path):
    """Run TrOCR over every page of an image-based PDF.

    Each page is rendered via ``convert_from_path(file_path, 300)`` and passed
    to ``extract_text_with_trocr`` with the module-level ``tokenizer`` and
    ``trocr_model``.

    Returns:
        The recognised text of all processed pages, each followed by a
        newline. If an exception occurs, it is reported with ``st.error`` and
        whatever text was collected before the failure is returned.
    """
    page_texts = []
    try:
        for rendered_page in convert_from_path(file_path, 300):
            recognised = extract_text_with_trocr(rendered_page, tokenizer, trocr_model)
            page_texts.append(recognised + "\n")
    except Exception as e:
        # Best effort: surface the problem in the UI and keep partial output.
        st.error(f"Error during OCR extraction: {e}")
    return "".join(page_texts)
# Translation Function
def translate_text(text, translator, max_chunk_size=512):
    """Translate text into the selected language, chunk by chunk.

    The text is split into pieces of at most ``max_chunk_size`` characters.
    Splits happen on whitespace, so a word is only cut when it is itself
    longer than ``max_chunk_size``. Each piece is translated separately and
    the translations are joined with single spaces.

    Args:
        text: The text to translate.
        translator: Callable that takes a string and returns a sequence whose
            first item is a mapping with a ``'translation_text'`` key.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        The joined translation, or ``""`` if the text is empty/whitespace-only
        or if any chunk fails to translate (the error is shown via
        ``st.error``).
    """
    # Slicing at fixed character offsets would cut words in half, and joining
    # the translated pieces with " " would then insert a space inside the
    # broken word. Wrapping on whitespace keeps words intact.
    text_chunks = textwrap.wrap(
        text,
        width=max_chunk_size,
        expand_tabs=False,
        replace_whitespace=False,  # keep line breaks inside a chunk as-is
        break_on_hyphens=False,  # avoid "well-" + " " + "known" after re-joining
    )

    translations = []
    for chunk in text_chunks:
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)
# Streamlit UI
# Page header and input widgets.
st.title("📚 Image-Based Document Translator with TrOCR and Translation Models")
st.write("Translate image-based PDF or image files using advanced models.")
uploaded_file = st.file_uploader("Upload a PDF or Image file (JPG/PNG)", type=["pdf", "jpg", "png"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    if uploaded_file.name.lower().endswith(".pdf"):
        # The upload lives only in memory, while the PDF helper needs a real
        # path. Write the bytes to a temp file whose name is generated by the
        # OS rather than built from the client-supplied filename, and remove
        # it once OCR has finished.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
            tmp_pdf.write(uploaded_file.getvalue())
            file_path = tmp_pdf.name
        try:
            text_content = extract_text_from_pdf_with_ocr(file_path)
        finally:
            os.remove(file_path)
    else:
        text_content = extract_text_with_trocr(Image.open(uploaded_file), tokenizer, trocr_model)

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    if st.button("Translate"):
        if text_content:
            st.subheader(f"Translated Text ({target_language})")
            # Dispatch on the selected language; a missing or falsy entry
            # means the corresponding model is unavailable.
            translator = {"English": translator_en, "Urdu": translator_ur}.get(target_language)
            if translator:
                translated_text = translate_text(text_content, translator)
                st.text_area("Translation Output", translated_text, height=300)
            else:
                # Only warn here: there is no translation to display.
                st.warning("Translation model not loaded successfully.")
        else:
            st.warning("No text found to translate. Please upload a valid document.")
|