Update app.py
Browse files
app.py
CHANGED
|
@@ -1,46 +1,70 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
import
|
| 3 |
-
from transformers import pipeline
|
| 4 |
import pytesseract
|
| 5 |
from pdf2image import convert_from_path
|
| 6 |
from PIL import Image
|
| 7 |
-
import
|
| 8 |
|
| 9 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
def load_translation_models():
|
| 11 |
-
"""
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
|
| 15 |
-
return translator_en, translator_ur
|
| 16 |
-
except Exception as e:
|
| 17 |
-
st.error(f"Error initializing translation models: {e}")
|
| 18 |
-
return None, None
|
| 19 |
|
| 20 |
translator_en, translator_ur = load_translation_models()
|
|
|
|
| 21 |
|
| 22 |
def extract_text_from_pdf_with_ocr(file_path):
|
| 23 |
-
"""Extract text from image-based PDF using
|
| 24 |
text = ""
|
| 25 |
try:
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
text += pytesseract.image_to_string(page) + "\n"
|
| 30 |
except Exception as e:
|
| 31 |
st.error(f"Error during OCR extraction: {e}")
|
| 32 |
return text
|
| 33 |
|
| 34 |
-
#
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
target_language = st.radio("Select target language for translation", ["English", "Urdu"])
|
| 38 |
|
| 39 |
if uploaded_file:
|
| 40 |
file_path = f"/mnt/data/{uploaded_file.name}"
|
| 41 |
-
|
| 42 |
-
#
|
| 43 |
-
text_content = extract_text_from_pdf_with_ocr(file_path) if uploaded_file.name.endswith(".pdf") else
|
| 44 |
|
| 45 |
st.subheader("Extracted Text (Preview)")
|
| 46 |
st.write(text_content[:500] if text_content else "No content found in the file.")
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForImageTextToText
|
|
|
|
| 3 |
import pytesseract
|
| 4 |
from pdf2image import convert_from_path
|
| 5 |
from PIL import Image
|
| 6 |
+
import torch
|
| 7 |
|
| 8 |
+
# Image-to-Text Model (TrOCR)
|
| 9 |
+
def load_image_to_text_model():
    """Load the TrOCR processor and model used for image-to-text extraction.

    Returns:
        A ``(processor, model)`` tuple. The first element fills the slot that
        callers name ``tokenizer``; it is a ``TrOCRProcessor``, which both
        turns an image into ``pixel_values`` when called and exposes
        ``batch_decode`` for the generated ids.
    """
    # Local import: the file-level import only brings in AutoTokenizer, which
    # handles text and cannot produce the pixel_values the OCR helper reads.
    from transformers import TrOCRProcessor

    checkpoint = "microsoft/trocr-large-printed"
    processor = TrOCRProcessor.from_pretrained(checkpoint)
    model = AutoModelForImageTextToText.from_pretrained(checkpoint)
    # NOTE(review): this is invoked at module level; consider caching the
    # loaded objects if the hosting framework re-runs the script often.
    return processor, model
|
| 13 |
+
|
| 14 |
+
def extract_text_with_trocr(image, tokenizer, model):
    """Extract text from an image using TrOCR.

    Args:
        image: The image to read. Objects exposing a ``mode`` attribute other
            than ``"RGB"`` (e.g. RGBA or greyscale PIL images) are converted
            with ``image.convert("RGB")`` first.
        tokenizer: Callable preprocessor; ``tokenizer(image,
            return_tensors="pt")`` must return an object with a
            ``pixel_values`` attribute, and it must provide ``batch_decode``.
        model: Object exposing ``generate(pixel_values)``.

    Returns:
        The first decoded sequence, or an empty string when decoding yields
        no sequences.
    """
    # Uploaded PNGs are frequently RGBA or palette images; normalise the
    # channel layout before preprocessing. Inputs without a ``mode``
    # attribute are passed through untouched.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    pixel_values = tokenizer(image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    # Guard the index so an empty decode result yields "" instead of raising.
    return decoded[0] if decoded else ""
|
| 19 |
+
|
| 20 |
+
# Multilingual Translation Models
|
| 21 |
def load_translation_models():
    """Build the two translation pipelines used by the app.

    Returns:
        A ``(translator_en, translator_ur)`` tuple: the first pipeline is
        created from ``Helsinki-NLP/opus-mt-mul-en`` and the second from
        ``Helsinki-NLP/opus-mt-en-ur``, both with the PyTorch framework.
    """
    model_ids = ("Helsinki-NLP/opus-mt-mul-en", "Helsinki-NLP/opus-mt-en-ur")
    # Pipelines are created in the listed order, one per checkpoint.
    to_english, to_urdu = [
        pipeline("translation", model=model_id, framework="pt")
        for model_id in model_ids
    ]
    return to_english, to_urdu
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
# Module-level setup: build the two translation pipelines, then the
# image-to-text pair; the OCR helper defined after these lines reads
# `tokenizer` and `trocr_model` as globals.
translator_en, translator_ur = load_translation_models()
|
| 27 |
+
tokenizer, trocr_model = load_image_to_text_model()
|
| 28 |
|
| 29 |
def extract_text_from_pdf_with_ocr(file_path, dpi=300):
    """Extract text from image-based PDF using TrOCR.

    Args:
        file_path: Path of the PDF on disk, handed to ``convert_from_path``.
        dpi: Resolution used when rasterising the pages. Defaults to 300,
            the value that was previously hard-coded.

    Returns:
        The recognised text of every processed page, each followed by a
        newline. If rasterisation or recognition raises, the error is shown
        with ``st.error`` and the text of the pages completed before the
        failure (possibly an empty string) is returned.
    """
    page_texts = []
    try:
        for page_image in convert_from_path(file_path, dpi=dpi):
            recognised = extract_text_with_trocr(page_image, tokenizer, trocr_model)
            page_texts.append(recognised + "\n")
    except Exception as e:
        # Broad catch kept on purpose: any failure is surfaced in the UI and
        # whatever was recognised so far is still returned to the caller.
        st.error(f"Error during OCR extraction: {e}")
    return "".join(page_texts)
|
| 39 |
|
| 40 |
+
# Translation Function
|
| 41 |
+
def translate_text(text, translator, max_chunk_size=512):
    """Translate text into the selected language.

    The text is split on whitespace and the words are packed into chunks of
    at most ``max_chunk_size`` characters, so no word is cut in half before
    it reaches the translator. Runs of whitespace (including newlines) are
    collapsed to single spaces as a consequence.

    Args:
        text: The text to translate. ``None``, empty, or whitespace-only
            input yields an empty string without calling the translator.
        translator: Callable taking a string and returning a sequence whose
            first item is a mapping with a ``'translation_text'`` key.
        max_chunk_size: Maximum number of characters per chunk (default 512).

    Returns:
        The translated chunks joined by single spaces, or an empty string if
        any chunk fails to translate (the error is reported via ``st.error``).

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be at least 1")
    if not text or not text.strip():
        return ""

    translations = []
    for chunk in _split_into_chunks(text, max_chunk_size):
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            # A partial translation would be misleading, so give up entirely.
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)


def _split_into_chunks(text, max_chunk_size):
    """Pack whitespace-separated words into chunks of at most max_chunk_size characters."""
    chunks = []
    current = ""
    for word in text.split():
        # A single word longer than the limit cannot be kept whole; emit it
        # in hard slices so every chunk still respects the limit.
        while len(word) > max_chunk_size:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_chunk_size])
            word = word[max_chunk_size:]
        if not current:
            current = word
        elif len(current) + 1 + len(word) <= max_chunk_size:
            current = f"{current} {word}"
        else:
            chunks.append(current)
            current = word
    if current:
        chunks.append(current)
    return chunks
|
| 55 |
+
|
| 56 |
+
# Streamlit UI
|
| 57 |
+
# Standard-library helpers used only by this UI section (temporary PDF file).
import os
import tempfile

st.title("📚 Image-Based Document Translator with TrOCR and Translation Models")
st.write("Translate image-based PDF or image files using advanced models.")

uploaded_file = st.file_uploader("Upload a PDF or Image file (JPG/PNG)", type=["pdf", "jpg", "png"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    text_content = ""
    # Compare the extension case-insensitively so "SCAN.PDF" is treated as a PDF.
    if uploaded_file.name.lower().endswith(".pdf"):
        # The upload lives in memory, but the PDF helper needs a path on
        # disk: spill the bytes to a temporary file and remove it afterwards.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
            tmp_pdf.write(uploaded_file.getvalue())
            file_path = tmp_pdf.name
        try:
            text_content = extract_text_from_pdf_with_ocr(file_path)
        finally:
            os.remove(file_path)
    else:
        # Mirror the PDF helper's error handling for direct image uploads.
        try:
            text_content = extract_text_with_trocr(Image.open(uploaded_file), tokenizer, trocr_model)
        except Exception as e:
            st.error(f"Error during OCR extraction: {e}")

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    # Wire the language choice to the translators; previously the selection
    # and both pipelines were created but never used.
    if text_content and st.button("Translate"):
        # NOTE(review): the Urdu pipeline is an en->ur model, so it presumably
        # expects English source text - confirm for non-English documents.
        translator = translator_en if target_language == "English" else translator_ur
        translated_text = translate_text(text_content, translator)
        st.subheader(f"Translated Text ({target_language})")
        st.write(translated_text if translated_text else "No translation produced.")
|