File size: 3,577 Bytes
ade99c4
f53330e
ded567d
 
 
f53330e
ade99c4
f53330e
 
 
 
 
 
 
 
 
 
 
 
 
04f9dd5
f53330e
 
 
04f9dd5
 
f53330e
ade99c4
ded567d
f53330e
ade99c4
464541c
f53330e
 
 
464541c
ded567d
ade99c4
 
f53330e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464541c
ade99c4
 
ded567d
f53330e
 
 
ded567d
ade99c4
 
 
 
 
 
04f9dd5
464541c
04f9dd5
464541c
04f9dd5
 
 
464541c
ade99c4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import tempfile
import textwrap

import streamlit as st
from transformers import pipeline, AutoTokenizer, AutoModelForImageTextToText, TrOCRProcessor
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import torch

# Image-to-Text Model (TrOCR)
def load_image_to_text_model():
    """Load the TrOCR processor and model used for OCR.

    A plain text tokenizer cannot turn an image into ``pixel_values``, so the
    first element of the returned pair is a ``TrOCRProcessor``. It is callable
    on images and also exposes ``batch_decode``, which is everything
    ``extract_text_with_trocr`` needs from its ``tokenizer`` argument.

    Returns:
        A ``(processor, model)`` tuple for "microsoft/trocr-large-printed".
    """
    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-large-printed")
    model = AutoModelForImageTextToText.from_pretrained("microsoft/trocr-large-printed")
    return processor, model

def extract_text_with_trocr(image, tokenizer, model):
    """Extract text from an image using TrOCR.

    Args:
        image: The image to read. Objects exposing a PIL-style ``mode`` that
            is not ``"RGB"`` (grayscale, palette, RGBA, ...) are converted to
            RGB first; anything else is passed through unchanged.
        tokenizer: Callable that preprocesses the image into an object with a
            ``pixel_values`` attribute and that provides ``batch_decode``.
        model: Model whose ``generate`` accepts those pixel values.

    Returns:
        The decoded text of the first generated sequence.
    """
    # NOTE(review): TrOCR checkpoints are presumably tuned for 3-channel
    # input; confirm against the model card. Normalising here keeps PNG
    # uploads with alpha or grayscale data from reaching the model as-is.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    pixel_values = tokenizer(image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]

# Multilingual Translation Models
def load_translation_models():
    """Build the two translation pipelines used by the app.

    Returns:
        A ``(translator_en, translator_ur)`` tuple: the first pipeline uses
        "Helsinki-NLP/opus-mt-mul-en", the second "Helsinki-NLP/opus-mt-en-ur".
    """
    model_ids = ("Helsinki-NLP/opus-mt-mul-en", "Helsinki-NLP/opus-mt-en-ur")
    # The generator is consumed left to right, so the pipelines are created in
    # the same order as the model ids above.
    to_english, to_urdu = (
        pipeline("translation", model=model_id, framework="pt")
        for model_id in model_ids
    )
    return to_english, to_urdu

# Streamlit re-executes this script on every widget interaction. Wrapping the
# loaders with st.cache_resource keeps one copy of each model alive across
# reruns instead of reloading them each time.
translator_en, translator_ur = st.cache_resource(load_translation_models)()
tokenizer, trocr_model = st.cache_resource(load_image_to_text_model)()

def extract_text_from_pdf_with_ocr(file_path):
    """Run TrOCR over every page of an image-based PDF.

    The PDF at ``file_path`` is rendered at 300 DPI and each page image is
    passed to ``extract_text_with_trocr``. Any exception is reported through
    ``st.error``; the text of the pages processed before the failure is still
    returned.

    Returns:
        The per-page texts, each followed by a newline, concatenated.
    """
    page_texts = []
    try:
        for rendered_page in convert_from_path(file_path, dpi=300):
            page_texts.append(
                extract_text_with_trocr(rendered_page, tokenizer, trocr_model) + "\n"
            )
    except Exception as e:
        st.error(f"Error during OCR extraction: {e}")
    return "".join(page_texts)

# Translation Function
def translate_text(text, translator, max_chunk_size=512):
    """Translate text into the selected language.

    The text is split into chunks of at most ``max_chunk_size`` characters.
    Splits happen on whitespace so that words are not cut in half at a chunk
    border; only a single word longer than ``max_chunk_size`` is broken up.
    Runs of whitespace (including newlines) are normalised to single spaces
    as part of the splitting.

    Args:
        text: The text to translate. Empty or ``None`` yields ``""``.
        translator: Callable taking a string and returning a list whose first
            item is a dict with a ``'translation_text'`` key.
        max_chunk_size: Maximum number of characters sent per call.

    Returns:
        The translated chunks joined with single spaces, or ``""`` if any
        chunk fails (the error is reported through ``st.error``).
    """
    if not text:
        return ""

    text_chunks = textwrap.wrap(text, width=max_chunk_size, break_on_hyphens=False)
    translations = []

    for chunk in text_chunks:
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)

# Streamlit UI
st.title("📚 Image-Based Document Translator with TrOCR and Translation Models")
st.write("Translate image-based PDF or image files using advanced models.")

uploaded_file = st.file_uploader(
    "Upload a PDF or Image file (JPG/PNG)", type=["pdf", "jpg", "jpeg", "png"]
)
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    text_content = ""

    # Compare the extension case-insensitively so "SCAN.PDF" is handled as a PDF.
    if uploaded_file.name.lower().endswith(".pdf"):
        # The PDF helper reads from a filesystem path, but nothing has written
        # the upload to disk yet, so spill it to a temporary file first.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
            tmp_pdf.write(uploaded_file.getvalue())
            file_path = tmp_pdf.name
        try:
            text_content = extract_text_from_pdf_with_ocr(file_path)
        finally:
            # delete=False above means the temporary file must be removed by hand.
            os.unlink(file_path)
    else:
        try:
            # Normalise to RGB so grayscale/RGBA uploads are handled uniformly.
            uploaded_image = Image.open(uploaded_file).convert("RGB")
            text_content = extract_text_with_trocr(uploaded_image, tokenizer, trocr_model)
        except Exception as e:
            st.error(f"Error during OCR extraction: {e}")

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    if st.button("Translate"):
        if text_content:
            st.subheader(f"Translated Text ({target_language})")
            translators = {"English": translator_en, "Urdu": translator_ur}
            translator = translators.get(target_language)
            if translator:
                translated_text = translate_text(text_content, translator)
                st.text_area("Translation Output", translated_text, height=300)
            else:
                # Only render the output box when a translation was produced;
                # otherwise there is no translated text to show.
                st.warning("Translation model not loaded successfully.")
        else:
            st.warning("No text found to translate. Please upload a valid document.")