Update app.py
Browse files
app.py
CHANGED
@@ -2,11 +2,12 @@ import streamlit as st
|
|
2 |
import PyPDF2
|
3 |
import docx2txt
|
4 |
from transformers import pipeline
|
5 |
-
import
|
|
|
|
|
6 |
|
7 |
# Load translation models
|
8 |
def load_translation_models():
|
9 |
-
"""Load translation models for English and Urdu."""
|
10 |
try:
|
11 |
translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
|
12 |
translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
|
@@ -17,59 +18,33 @@ def load_translation_models():
|
|
17 |
|
18 |
translator_en, translator_ur = load_translation_models()
|
19 |
|
20 |
-
def
|
21 |
-
"""Extract text from
|
22 |
text = ""
|
23 |
try:
|
24 |
-
|
25 |
-
|
26 |
-
|
|
|
|
|
27 |
except Exception as e:
|
28 |
-
st.error(f"Error
|
29 |
return text
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
try:
|
34 |
-
return docx2txt.process(file)
|
35 |
-
except Exception as e:
|
36 |
-
st.error(f"Error extracting text from Word document: {e}")
|
37 |
-
return ""
|
38 |
-
|
39 |
-
def _split_text_into_chunks(text, max_chunk_size):
    """Split text into pieces of at most max_chunk_size characters, breaking at whitespace when possible."""
    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = min(start + max_chunk_size, text_length)
        if end < text_length:
            # Back up to the last whitespace inside the window so a word is
            # not cut in half; if the window holds no whitespace (one very
            # long token), fall back to a hard split at the size limit.
            for idx in range(end - 1, start, -1):
                if text[idx].isspace():
                    end = idx + 1
                    break
        piece = text[start:end].strip()
        if piece:
            # Whitespace-only windows carry nothing to translate.
            chunks.append(piece)
        start = end
    return chunks


def translate_text(text, translator, max_chunk_size=512):
    """Translate text in manageable chunks.

    The text is cut into pieces of at most ``max_chunk_size`` characters,
    preferring whitespace boundaries so that words are not split across two
    translator calls. Each piece is translated separately and the results
    are joined with single spaces.

    Args:
        text: The text to translate. ``None`` or an empty string yields "".
        translator: Callable that takes a string and returns a sequence whose
            first item is a mapping with a ``'translation_text'`` entry.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        The joined translation, or "" if the input is empty or any chunk
        fails to translate (the failure is reported through ``st.error``).

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        raise ValueError("max_chunk_size must be a positive integer")
    if not text:
        return ""

    translations = []
    for chunk in _split_text_into_chunks(text, max_chunk_size):
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            # Same policy as before: report the failure and give up on the
            # whole document rather than return a partial translation.
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)
|
53 |
-
|
54 |
-
# Streamlit UI
|
55 |
-
st.title("📚 Multilingual Document Translator")
|
56 |
-
st.write("Translate PDF or Word documents to English and Urdu effortlessly!")
|
57 |
-
|
58 |
uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
|
59 |
target_language = st.radio("Select target language for translation", ["English", "Urdu"])
|
60 |
|
61 |
if uploaded_file:
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
else
|
66 |
-
|
67 |
-
|
68 |
-
# Show extracted text preview
|
69 |
st.subheader("Extracted Text (Preview)")
|
70 |
st.write(text_content[:500] if text_content else "No content found in the file.")
|
71 |
|
72 |
-
# Perform translation when the user clicks the button
|
73 |
if st.button("Translate"):
|
74 |
if text_content:
|
75 |
st.subheader(f"Translated Text ({target_language})")
|
|
|
2 |
import PyPDF2
|
3 |
import docx2txt
|
4 |
from transformers import pipeline
|
5 |
+
import pytesseract
|
6 |
+
from pdf2image import convert_from_path
|
7 |
+
from PIL import Image
|
8 |
|
9 |
# Load translation models
|
10 |
def load_translation_models():
|
|
|
11 |
try:
|
12 |
translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
|
13 |
translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
|
|
|
18 |
|
19 |
translator_en, translator_ur = load_translation_models()
|
20 |
|
21 |
+
def extract_text_from_pdf_with_ocr(file_path):
    """Extract text from an image-based PDF using OCR.

    Every page of the PDF is rendered to an image at 300 DPI and run
    through Tesseract.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The recognised text with a newline appended after each page. If an
        error occurs it is reported through ``st.error`` and the text
        collected before the failure (possibly "") is returned.
    """
    page_texts = []
    try:
        # Convert PDF to images. convert_from_path already returns PIL
        # images, so each page can be handed to Tesseract as-is without
        # re-wrapping it through Image.fromarray.
        pages = convert_from_path(file_path, dpi=300)
        for page in pages:
            page_texts.append(pytesseract.image_to_string(page) + "\n")
    except Exception as e:
        st.error(f"Error during OCR extraction: {e}")
    return "".join(page_texts)
|
33 |
|
34 |
+
# Streamlit UI for document translation
|
35 |
+
st.title("📚 Multilingual Document Translator with OCR Support")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
|
37 |
target_language = st.radio("Select target language for translation", ["English", "Urdu"])
|
38 |
|
39 |
if uploaded_file:
|
40 |
+
file_path = f"/mnt/data/{uploaded_file.name}"
|
41 |
+
|
42 |
+
# OCR-based text extraction for PDFs
|
43 |
+
text_content = extract_text_from_pdf_with_ocr(file_path) if uploaded_file.name.endswith(".pdf") else extract_text_from_word(uploaded_file)
|
44 |
+
|
|
|
|
|
45 |
st.subheader("Extracted Text (Preview)")
|
46 |
st.write(text_content[:500] if text_content else "No content found in the file.")
|
47 |
|
|
|
48 |
if st.button("Translate"):
|
49 |
if text_content:
|
50 |
st.subheader(f"Translated Text ({target_language})")
|