Update app.py
Browse files
app.py
CHANGED
@@ -1,46 +1,70 @@
|
|
1 |
import streamlit as st
|
2 |
-
import
|
3 |
-
from transformers import pipeline
|
4 |
import pytesseract
|
5 |
from pdf2image import convert_from_path
|
6 |
from PIL import Image
|
7 |
-
import
|
8 |
|
9 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
def load_translation_models():
|
11 |
-
"""
|
12 |
-
|
13 |
-
|
14 |
-
translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
|
15 |
-
return translator_en, translator_ur
|
16 |
-
except Exception as e:
|
17 |
-
st.error(f"Error initializing translation models: {e}")
|
18 |
-
return None, None
|
19 |
|
20 |
translator_en, translator_ur = load_translation_models()
|
|
|
21 |
|
22 |
def extract_text_from_pdf_with_ocr(file_path):
|
23 |
-
"""Extract text from image-based PDF using
|
24 |
text = ""
|
25 |
try:
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
text += pytesseract.image_to_string(page) + "\n"
|
30 |
except Exception as e:
|
31 |
st.error(f"Error during OCR extraction: {e}")
|
32 |
return text
|
33 |
|
34 |
-
#
|
35 |
-
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
target_language = st.radio("Select target language for translation", ["English", "Urdu"])
|
38 |
|
39 |
if uploaded_file:
|
40 |
file_path = f"/mnt/data/{uploaded_file.name}"
|
41 |
-
|
42 |
-
#
|
43 |
-
text_content = extract_text_from_pdf_with_ocr(file_path) if uploaded_file.name.endswith(".pdf") else
|
44 |
|
45 |
st.subheader("Extracted Text (Preview)")
|
46 |
st.write(text_content[:500] if text_content else "No content found in the file.")
|
|
|
1 |
import streamlit as st
|
2 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForImageTextToText
|
|
|
3 |
import pytesseract
|
4 |
from pdf2image import convert_from_path
|
5 |
from PIL import Image
|
6 |
+
import torch
|
7 |
|
8 |
+
# Image-to-Text Model (TrOCR)
|
9 |
+
def load_image_to_text_model():
    """Load the TrOCR pre-processor and model used for OCR.

    Returns:
        A ``(processor, model)`` tuple for the
        ``microsoft/trocr-large-printed`` checkpoint. The first element is a
        ``TrOCRProcessor``: it is callable on images (yielding
        ``pixel_values``) and also exposes ``batch_decode``, so it can be used
        wherever this file passes the value around as ``tokenizer``.
    """
    # Local import keeps this block runnable without touching the file-level
    # import list; ``transformers`` is already a dependency of this file.
    from transformers import TrOCRProcessor

    checkpoint = "microsoft/trocr-large-printed"
    # A plain text tokenizer cannot turn an image into ``pixel_values``; the
    # processor bundles the image pre-processing with the tokenizer.
    processor = TrOCRProcessor.from_pretrained(checkpoint)
    model = AutoModelForImageTextToText.from_pretrained(checkpoint)
    return processor, model
|
13 |
+
|
14 |
+
def extract_text_with_trocr(image, tokenizer, model):
    """Extract text from an image using TrOCR.

    Args:
        image: The image to read. Objects exposing a ``mode`` attribute other
            than ``"RGB"`` (e.g. PIL images opened from RGBA or grayscale
            files) are converted to RGB first; anything else is passed through
            unchanged.
        tokenizer: Callable pre-processor. It is called as
            ``tokenizer(image, return_tensors="pt")`` and must return an
            object with ``pixel_values``; it must also provide
            ``batch_decode``.
        model: Object whose ``generate(pixel_values)`` returns token ids.

    Returns:
        The first decoded string produced for the image.
    """
    # NOTE(review): TrOCR checkpoints are documented as single text-line
    # recognizers; full-page inputs may need line segmentation -- confirm.
    # Uploaded PNGs are often RGBA or palette images, so normalize the mode
    # before handing the image to the pre-processor.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")
    pixel_values = tokenizer(image, return_tensors="pt").pixel_values
    outputs = model.generate(pixel_values)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
|
19 |
+
|
20 |
+
# Multilingual Translation Models
|
21 |
def load_translation_models():
    """Create the two translation pipelines used by the app.

    Returns:
        A ``(translator_en, translator_ur)`` tuple: the first pipeline is
        built from ``Helsinki-NLP/opus-mt-mul-en`` and the second from
        ``Helsinki-NLP/opus-mt-en-ur``, both on the PyTorch framework.
    """
    checkpoints = ("Helsinki-NLP/opus-mt-mul-en", "Helsinki-NLP/opus-mt-en-ur")
    # Built in order, so the English-target pipeline is always created first.
    to_english, to_urdu = [
        pipeline("translation", model=checkpoint, framework="pt")
        for checkpoint in checkpoints
    ]
    return to_english, to_urdu
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
# Build both translation pipelines at module level so they exist as globals
# for the code that follows.
translator_en, translator_ur = load_translation_models()
|
27 |
+
# Load the TrOCR pair at module level; extract_text_from_pdf_with_ocr and the
# upload handler read ``tokenizer`` and ``trocr_model`` as globals.
tokenizer, trocr_model = load_image_to_text_model()
|
28 |
|
29 |
def extract_text_from_pdf_with_ocr(file_path):
    """Run TrOCR over every page of an image-based PDF.

    Args:
        file_path: Path of the PDF handed to ``convert_from_path``.

    Returns:
        The recognized text of each page, each followed by a newline. If an
        error occurs, it is reported through ``st.error`` and whatever was
        recognized before the failure is returned (possibly ``""``).
    """
    recognized_pages = []
    try:
        # 300 is passed positionally as the second argument (render DPI).
        for rendered_page in convert_from_path(file_path, 300):
            page_text = extract_text_with_trocr(rendered_page, tokenizer, trocr_model)
            recognized_pages.append(page_text + "\n")
    except Exception as e:
        st.error(f"Error during OCR extraction: {e}")
    return "".join(recognized_pages)
|
39 |
|
40 |
+
# Translation Function
|
41 |
+
def translate_text(text, translator, max_chunk_size=512):
    """Translate text chunk by chunk with the given translator.

    The text is split on whitespace into chunks of at most
    ``max_chunk_size`` characters so that words are not cut in half; only a
    single word longer than the limit is split mid-word.

    Args:
        text: Source text to translate.
        translator: Callable that takes a string and returns a sequence whose
            first item is a mapping with a ``"translation_text"`` key.
        max_chunk_size: Maximum number of characters per chunk passed to
            ``translator``. Defaults to 512.

    Returns:
        The translated chunks joined with single spaces. Returns ``""`` when
        ``text`` is empty or whitespace-only, or when translating any chunk
        fails (the error is reported through ``st.error``).

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    # Local import keeps this block independent of the file-level imports.
    import textwrap

    # wrap() collapses runs of whitespace, drops blank chunks, and returns an
    # empty list for empty input, so the translator never sees an empty
    # string. Hyphen breaking is disabled so "well-known" is not split into
    # two chunks that would be re-joined with a space.
    text_chunks = textwrap.wrap(
        text,
        width=max_chunk_size,
        break_long_words=True,
        break_on_hyphens=False,
    )
    translations = []

    for chunk in text_chunks:
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)
|
55 |
+
|
56 |
+
# Streamlit UI
|
57 |
+
st.title("📚 Image-Based Document Translator with TrOCR and Translation Models")
st.write("Translate image-based PDF or image files using advanced models.")

uploaded_file = st.file_uploader("Upload a PDF or Image file (JPG/PNG)", type=["pdf", "jpg", "png"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # Local imports keep this block independent of the file-level imports.
    import os
    import tempfile

    # Compare case-insensitively so "SCAN.PDF" is still treated as a PDF.
    if uploaded_file.name.lower().endswith(".pdf"):
        # The upload only exists in memory, while the PDF extractor needs a
        # real path, so write the bytes to a temporary file first.
        # delete=False lets the file be reopened by path after it is closed.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_pdf:
            tmp_pdf.write(uploaded_file.getvalue())
            file_path = tmp_pdf.name
        try:
            # Image-based PDF processing using TrOCR
            text_content = extract_text_from_pdf_with_ocr(file_path)
        finally:
            os.remove(file_path)
    else:
        # Image uploads can be opened directly from the in-memory file object.
        text_content = extract_text_with_trocr(Image.open(uploaded_file), tokenizer, trocr_model)

    st.subheader("Extracted Text (Preview)")
    # Only the first 500 characters are shown as a preview.
    st.write(text_content[:500] if text_content else "No content found in the file.")
|