Update app.py
Browse files
app.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
import fitz
|
2 |
from PIL import Image
|
3 |
from transformers import pipeline
|
4 |
import streamlit as st
|
@@ -10,10 +10,12 @@ from langdetect import detect
|
|
10 |
# OCR pipeline: Hugging Face "image-to-text" task using the
# microsoft/trocr-base-printed model. Built at module level, so it is
# constructed every time this script is executed.
trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
|
11 |
# Translation pipeline backed by the Helsinki-NLP/opus-mt-mul-en model
# (presumably many-languages-to-English -- confirm against the model card).
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
|
12 |
|
|
|
13 |
def extract_text_from_image(image):
    """Run OCR on ``image`` and return the recognized text.

    The image is passed to the module-level ``trocr_pipeline``. The
    ``'generated_text'`` field of the first result is returned; if the
    pipeline yields nothing, an empty string is returned instead.

    Args:
        image: Input accepted by ``trocr_pipeline`` (presumably a PIL
            image -- confirm against the caller).

    Returns:
        The recognized text, or ``""`` when there is no result.
    """
    predictions = trocr_pipeline(image)
    if not predictions:
        return ""
    first_prediction = predictions[0]
    return first_prediction['generated_text']
|
16 |
|
|
|
17 |
def extract_from_pdf(pdf_path):
|
18 |
doc = fitz.open(pdf_path)
|
19 |
full_text = ""
|
@@ -22,6 +24,7 @@ def extract_from_pdf(pdf_path):
|
|
22 |
full_text += page.get_text() + "\n"
|
23 |
return full_text.strip()
|
24 |
|
|
|
25 |
def extract_from_word(docx_path):
|
26 |
doc = Document(docx_path)
|
27 |
full_text = ""
|
@@ -29,9 +32,11 @@ def extract_from_word(docx_path):
|
|
29 |
full_text += para.text + "\n"
|
30 |
return full_text.strip()
|
31 |
|
|
|
32 |
def clean_text(text):
    """Strip control characters from ``text`` without fusing adjacent words.

    Whitespace-like control characters (tab, newline, carriage return,
    vertical tab, form feed, NEL) are replaced by a single space, so that
    the last word of one line is not glued to the first word of the next.
    All remaining C0/C1 control characters and DEL are deleted, and the
    result is stripped of leading/trailing whitespace.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, stripped string.
    """
    # Runs of line breaks/tabs become one space; deleting them outright
    # would turn "Hello\nWorld" into "HelloWorld".
    spaced = re.sub(r'[\t\n\r\x0b\x0c\x85]+', ' ', text)
    return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', spaced).strip()
|
34 |
|
|
|
35 |
def translate_text(text):
|
36 |
if not text.strip():
|
37 |
return "No text available for translation."
|
@@ -50,12 +55,22 @@ def translate_text(text):
|
|
50 |
translated_text += translated_chunk[0]['translation_text'] + " "
|
51 |
return translated_text.strip()
|
52 |
|
|
|
53 |
def create_pdf(translated_text, output_path):
|
|
|
54 |
doc = fitz.open()
|
55 |
page = doc.new_page()
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
doc.save(output_path)
|
58 |
|
|
|
59 |
st.title("Multilingual Document Translator")
|
60 |
uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])
|
61 |
|
@@ -86,18 +101,21 @@ if uploaded_file is not None:
|
|
86 |
st.subheader("Translated Text (English)")
|
87 |
st.write(translated_text)
|
88 |
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
|
|
|
|
|
|
99 |
finally:
|
100 |
if os.path.exists(temp_file_path):
|
101 |
os.remove(temp_file_path)
|
102 |
-
if os.path.exists(
|
103 |
-
os.remove(
|
|
|
1 |
+
import fitz # PyMuPDF for PDF processing
|
2 |
from PIL import Image
|
3 |
from transformers import pipeline
|
4 |
import streamlit as st
|
|
|
10 |
# OCR pipeline: Hugging Face "image-to-text" task using the
# microsoft/trocr-base-printed model. Built at module level, so it is
# constructed every time this script is executed.
trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
|
11 |
# Translation pipeline backed by the Helsinki-NLP/opus-mt-mul-en model
# (presumably many-languages-to-English -- confirm against the model card).
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
|
12 |
|
13 |
+
|
14 |
def extract_text_from_image(image):
    """Run OCR on ``image`` and return the recognized text.

    The image is passed to the module-level ``trocr_pipeline``. The
    ``'generated_text'`` field of the first result is returned; if the
    pipeline yields nothing, an empty string is returned instead.

    Args:
        image: Input accepted by ``trocr_pipeline`` (presumably a PIL
            image -- confirm against the caller).

    Returns:
        The recognized text, or ``""`` when there is no result.
    """
    predictions = trocr_pipeline(image)
    if not predictions:
        return ""
    first_prediction = predictions[0]
    return first_prediction['generated_text']
|
17 |
|
18 |
+
|
19 |
def extract_from_pdf(pdf_path):
|
20 |
doc = fitz.open(pdf_path)
|
21 |
full_text = ""
|
|
|
24 |
full_text += page.get_text() + "\n"
|
25 |
return full_text.strip()
|
26 |
|
27 |
+
|
28 |
def extract_from_word(docx_path):
|
29 |
doc = Document(docx_path)
|
30 |
full_text = ""
|
|
|
32 |
full_text += para.text + "\n"
|
33 |
return full_text.strip()
|
34 |
|
35 |
+
|
36 |
def clean_text(text):
    """Strip control characters from ``text`` without fusing adjacent words.

    Whitespace-like control characters (tab, newline, carriage return,
    vertical tab, form feed, NEL) are replaced by a single space, so that
    the last word of one line is not glued to the first word of the next.
    All remaining C0/C1 control characters and DEL are deleted, and the
    result is stripped of leading/trailing whitespace.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, stripped string.
    """
    # Runs of line breaks/tabs become one space; deleting them outright
    # would turn "Hello\nWorld" into "HelloWorld".
    spaced = re.sub(r'[\t\n\r\x0b\x0c\x85]+', ' ', text)
    return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', spaced).strip()
|
38 |
|
39 |
+
|
40 |
def translate_text(text):
|
41 |
if not text.strip():
|
42 |
return "No text available for translation."
|
|
|
55 |
translated_text += translated_chunk[0]['translation_text'] + " "
|
56 |
return translated_text.strip()
|
57 |
|
58 |
+
|
59 |
def _wrap_to_width(text, max_width, fontname, fontsize):
    """Greedily split ``text`` into lines at most ``max_width`` points wide.

    Widths are measured with ``fitz.get_text_length``. Runs of whitespace
    inside ``text`` collapse to single spaces. A single word wider than
    ``max_width`` is kept whole on its own line. Empty or whitespace-only
    input yields ``[""]`` so blank lines still take up vertical space.
    """
    words = text.split()
    if not words:
        return [""]
    lines = []
    current = words[0]
    for word in words[1:]:
        candidate = current + " " + word
        width = fitz.get_text_length(
            candidate, fontname=fontname, fontsize=fontsize
        )
        if width <= max_width:
            current = candidate
        else:
            lines.append(current)
            current = word
    lines.append(current)
    return lines


def create_pdf(translated_text, output_path):
    """Write ``translated_text`` into a new PDF saved at ``output_path``.

    The text is split on newlines, each line is word-wrapped to the
    printable width, and extra pages are added when a page fills up.
    Text is drawn with ``Page.insert_text``; the previous implementation
    filled a ``TextWriter`` but never wrote it to the page (and passed an
    unsupported ``fontname`` keyword), so no text reached the PDF.

    Args:
        translated_text: Text to render; ``"\\n"`` separates lines.
        output_path: Destination path of the PDF file.
    """
    margin = 50        # points; same start offset as the original (50, 50)
    font_size = 10
    line_height = 15   # baseline-to-baseline distance, as in the original
    font_name = "helv"

    doc = fitz.open()
    try:
        page = doc.new_page()
        max_width = page.rect.width - 2 * margin
        bottom = page.rect.height - margin
        y_position = margin
        for source_line in translated_text.split("\n"):
            wrapped = _wrap_to_width(
                source_line, max_width, font_name, font_size
            )
            for line in wrapped:
                # Start a fresh page instead of drawing below the margin.
                if y_position > bottom:
                    page = doc.new_page()
                    y_position = margin
                if line:
                    page.insert_text(
                        (margin, y_position),
                        line,
                        fontsize=font_size,
                        fontname=font_name,
                    )
                y_position += line_height
        doc.save(output_path)
    finally:
        # Release the document even if rendering or saving fails.
        doc.close()
|
72 |
|
73 |
+
|
74 |
st.title("Multilingual Document Translator")
|
75 |
uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])
|
76 |
|
|
|
101 |
st.subheader("Translated Text (English)")
|
102 |
st.write(translated_text)
|
103 |
|
104 |
+
if translated_text.strip():
|
105 |
+
output_pdf_path = "translated_document.pdf"
|
106 |
+
create_pdf(translated_text, output_pdf_path)
|
107 |
+
|
108 |
+
with open(output_pdf_path, "rb") as f:
|
109 |
+
st.download_button(
|
110 |
+
label="Download Translated PDF",
|
111 |
+
data=f,
|
112 |
+
file_name="translated_document.pdf",
|
113 |
+
mime="application/pdf"
|
114 |
+
)
|
115 |
+
else:
|
116 |
+
st.warning("No content to save in the translated PDF.")
|
117 |
finally:
|
118 |
if os.path.exists(temp_file_path):
|
119 |
os.remove(temp_file_path)
|
120 |
+
if os.path.exists("translated_document.pdf"):
|
121 |
+
os.remove("translated_document.pdf")
|