tahirsher committed on
Commit
b57bd69
·
verified ·
1 Parent(s): 5bb4750

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -14
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import fitz
2
  from PIL import Image
3
  from transformers import pipeline
4
  import streamlit as st
@@ -10,10 +10,12 @@ from langdetect import detect
10
  trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
11
  translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
12
 
 
13
  def extract_text_from_image(image):
14
  result = trocr_pipeline(image)
15
  return result[0]['generated_text'] if result else ""
16
 
 
17
  def extract_from_pdf(pdf_path):
18
  doc = fitz.open(pdf_path)
19
  full_text = ""
@@ -22,6 +24,7 @@ def extract_from_pdf(pdf_path):
22
  full_text += page.get_text() + "\n"
23
  return full_text.strip()
24
 
 
25
  def extract_from_word(docx_path):
26
  doc = Document(docx_path)
27
  full_text = ""
@@ -29,9 +32,11 @@ def extract_from_word(docx_path):
29
  full_text += para.text + "\n"
30
  return full_text.strip()
31
 
 
32
  def clean_text(text):
33
  return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text).strip()
34
 
 
35
  def translate_text(text):
36
  if not text.strip():
37
  return "No text available for translation."
@@ -50,12 +55,22 @@ def translate_text(text):
50
  translated_text += translated_chunk[0]['translation_text'] + " "
51
  return translated_text.strip()
52
 
 
53
  def create_pdf(translated_text, output_path):
 
54
  doc = fitz.open()
55
  page = doc.new_page()
56
- page.insert_text((50, 50), translated_text, fontsize=12, fontname="helv")
 
 
 
 
 
 
 
57
  doc.save(output_path)
58
 
 
59
  st.title("Multilingual Document Translator")
60
  uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])
61
 
@@ -86,18 +101,21 @@ if uploaded_file is not None:
86
  st.subheader("Translated Text (English)")
87
  st.write(translated_text)
88
 
89
- output_pdf_path = "translated_document.pdf"
90
- create_pdf(translated_text, output_pdf_path)
91
-
92
- with open(output_pdf_path, "rb") as f:
93
- st.download_button(
94
- label="Download Translated PDF",
95
- data=f,
96
- file_name="translated_document.pdf",
97
- mime="application/pdf"
98
- )
 
 
 
99
  finally:
100
  if os.path.exists(temp_file_path):
101
  os.remove(temp_file_path)
102
- if os.path.exists(output_pdf_path):
103
- os.remove(output_pdf_path)
 
1
+ import fitz # PyMuPDF for PDF processing
2
  from PIL import Image
3
  from transformers import pipeline
4
  import streamlit as st
 
10
  trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
11
  translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
12
 
13
+
14
  def extract_text_from_image(image):
15
  result = trocr_pipeline(image)
16
  return result[0]['generated_text'] if result else ""
17
 
18
+
19
  def extract_from_pdf(pdf_path):
20
  doc = fitz.open(pdf_path)
21
  full_text = ""
 
24
  full_text += page.get_text() + "\n"
25
  return full_text.strip()
26
 
27
+
28
  def extract_from_word(docx_path):
29
  doc = Document(docx_path)
30
  full_text = ""
 
32
  full_text += para.text + "\n"
33
  return full_text.strip()
34
 
35
+
36
  def clean_text(text):
37
  return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text).strip()
38
 
39
+
40
  def translate_text(text):
41
  if not text.strip():
42
  return "No text available for translation."
 
55
  translated_text += translated_chunk[0]['translation_text'] + " "
56
  return translated_text.strip()
57
 
58
+
59
  def create_pdf(translated_text, output_path):
60
+ # Ensure translated text is inserted into PDF properly
61
  doc = fitz.open()
62
  page = doc.new_page()
63
+ wrapped_text = fitz.TextWriter(page.rect)
64
+
65
+ # Properly format text insertion for multiline text
66
+ lines = translated_text.split("\n")
67
+ for idx, line in enumerate(lines):
68
+ y_position = 50 + (idx * 15) # Adjust line spacing (15pt between lines)
69
+ wrapped_text.append((50, y_position), line, fontsize=10, fontname="helv")
70
+
71
  doc.save(output_path)
72
 
73
+
74
  st.title("Multilingual Document Translator")
75
  uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])
76
 
 
101
  st.subheader("Translated Text (English)")
102
  st.write(translated_text)
103
 
104
+ if translated_text.strip():
105
+ output_pdf_path = "translated_document.pdf"
106
+ create_pdf(translated_text, output_pdf_path)
107
+
108
+ with open(output_pdf_path, "rb") as f:
109
+ st.download_button(
110
+ label="Download Translated PDF",
111
+ data=f,
112
+ file_name="translated_document.pdf",
113
+ mime="application/pdf"
114
+ )
115
+ else:
116
+ st.warning("No content to save in the translated PDF.")
117
  finally:
118
  if os.path.exists(temp_file_path):
119
  os.remove(temp_file_path)
120
+ if os.path.exists("translated_document.pdf"):
121
+ os.remove("translated_document.pdf")