tahirsher committed on
Commit
5bb4750
·
verified ·
1 Parent(s): 63f5b6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -25
app.py CHANGED
@@ -1,23 +1,19 @@
1
- import fitz # PyMuPDF for PDF processing
2
- from PIL import Image # For image processing
3
  from transformers import pipeline
4
  import streamlit as st
5
  import os
6
  import re
7
- from docx import Document # For Word document processing
 
8
 
9
- # Load the TrOCR model for image-to-text (smaller model)
10
  trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
11
-
12
- # Load the translation model (smaller model)
13
  translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
14
 
15
- # Function to extract text from an image using TrOCR
16
  def extract_text_from_image(image):
17
  result = trocr_pipeline(image)
18
  return result[0]['generated_text'] if result else ""
19
 
20
- # Function to extract text from a PDF (optimized for performance)
21
  def extract_from_pdf(pdf_path):
22
  doc = fitz.open(pdf_path)
23
  full_text = ""
@@ -26,7 +22,6 @@ def extract_from_pdf(pdf_path):
26
  full_text += page.get_text() + "\n"
27
  return full_text.strip()
28
 
29
- # Function to extract text from a Word document
30
  def extract_from_word(docx_path):
31
  doc = Document(docx_path)
32
  full_text = ""
@@ -34,42 +29,44 @@ def extract_from_word(docx_path):
34
  full_text += para.text + "\n"
35
  return full_text.strip()
36
 
37
- # Function to clean extracted text
38
  def clean_text(text):
39
  return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text).strip()
40
 
41
- # Function to translate text to English (batched for performance)
42
  def translate_text(text):
 
 
 
 
 
 
 
 
 
43
  chunks = [text[i:i + 500] for i in range(0, len(text), 500)]
44
  translated_text = ""
45
  for chunk in chunks:
46
- if chunk.strip():
47
- translated_chunk = translator(chunk, max_length=400)
48
- if isinstance(translated_chunk, list) and 'translation_text' in translated_chunk[0]:
49
- translated_text += translated_chunk[0]['translation_text'] + " "
50
  return translated_text.strip()
51
 
52
- # Function to create a PDF from translated text
53
  def create_pdf(translated_text, output_path):
54
  doc = fitz.open()
55
  page = doc.new_page()
56
  page.insert_text((50, 50), translated_text, fontsize=12, fontname="helv")
57
  doc.save(output_path)
58
 
59
- # Streamlit UI
60
  st.title("Multilingual Document Translator")
61
  uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])
62
 
63
  if uploaded_file is not None:
64
  with st.spinner("Processing document..."):
65
- # Save the uploaded file temporarily
66
  file_extension = uploaded_file.name.split(".")[-1].lower()
67
  temp_file_path = f"temp.{file_extension}"
68
  with open(temp_file_path, "wb") as f:
69
  f.write(uploaded_file.getbuffer())
70
 
71
  try:
72
- # Extract text based on file type
73
  if file_extension == "pdf":
74
  extracted_text = extract_from_pdf(temp_file_path)
75
  elif file_extension in ["jpg", "jpeg", "png"]:
@@ -81,21 +78,17 @@ if uploaded_file is not None:
81
  st.error("Unsupported file format.")
82
  st.stop()
83
 
84
- # Clean and translate the extracted text
85
  extracted_text = clean_text(extracted_text)
86
- st.write("Extracted Text for Debugging (First 500 characters):", extracted_text[:500])
87
 
88
  translated_text = translate_text(extracted_text)
89
 
90
- # Display the translated text
91
  st.subheader("Translated Text (English)")
92
  st.write(translated_text)
93
 
94
- # Create a PDF from the translated text
95
  output_pdf_path = "translated_document.pdf"
96
  create_pdf(translated_text, output_pdf_path)
97
 
98
- # Provide a download link for the translated PDF
99
  with open(output_pdf_path, "rb") as f:
100
  st.download_button(
101
  label="Download Translated PDF",
@@ -104,7 +97,6 @@ if uploaded_file is not None:
104
  mime="application/pdf"
105
  )
106
  finally:
107
- # Clean up temporary files
108
  if os.path.exists(temp_file_path):
109
  os.remove(temp_file_path)
110
  if os.path.exists(output_pdf_path):
 
1
+ import fitz
2
+ from PIL import Image
3
  from transformers import pipeline
4
  import streamlit as st
5
  import os
6
  import re
7
+ from docx import Document
8
+ from langdetect import detect
9
 
 
10
  trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
 
 
11
  translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
12
 
 
13
  def extract_text_from_image(image):
14
  result = trocr_pipeline(image)
15
  return result[0]['generated_text'] if result else ""
16
 
 
17
  def extract_from_pdf(pdf_path):
18
  doc = fitz.open(pdf_path)
19
  full_text = ""
 
22
  full_text += page.get_text() + "\n"
23
  return full_text.strip()
24
 
 
25
  def extract_from_word(docx_path):
26
  doc = Document(docx_path)
27
  full_text = ""
 
29
  full_text += para.text + "\n"
30
  return full_text.strip()
31
 
 
32
  def clean_text(text):
33
  return re.sub(r'[\x00-\x1f\x7f-\x9f]', '', text).strip()
34
 
 
35
  def translate_text(text):
36
+ if not text.strip():
37
+ return "No text available for translation."
38
+
39
+ detected_language = detect(text)
40
+ st.write(f"Detected language: {detected_language}")
41
+
42
+ if detected_language == "en":
43
+ return "The text is already in English."
44
+
45
  chunks = [text[i:i + 500] for i in range(0, len(text), 500)]
46
  translated_text = ""
47
  for chunk in chunks:
48
+ translated_chunk = translator(chunk, max_length=400)
49
+ if isinstance(translated_chunk, list) and 'translation_text' in translated_chunk[0]:
50
+ translated_text += translated_chunk[0]['translation_text'] + " "
 
51
  return translated_text.strip()
52
 
 
53
  def create_pdf(translated_text, output_path):
54
  doc = fitz.open()
55
  page = doc.new_page()
56
  page.insert_text((50, 50), translated_text, fontsize=12, fontname="helv")
57
  doc.save(output_path)
58
 
 
59
  st.title("Multilingual Document Translator")
60
  uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])
61
 
62
  if uploaded_file is not None:
63
  with st.spinner("Processing document..."):
 
64
  file_extension = uploaded_file.name.split(".")[-1].lower()
65
  temp_file_path = f"temp.{file_extension}"
66
  with open(temp_file_path, "wb") as f:
67
  f.write(uploaded_file.getbuffer())
68
 
69
  try:
 
70
  if file_extension == "pdf":
71
  extracted_text = extract_from_pdf(temp_file_path)
72
  elif file_extension in ["jpg", "jpeg", "png"]:
 
78
  st.error("Unsupported file format.")
79
  st.stop()
80
 
 
81
  extracted_text = clean_text(extracted_text)
82
+ st.write("Extracted Text (First 500 characters):", extracted_text[:500])
83
 
84
  translated_text = translate_text(extracted_text)
85
 
 
86
  st.subheader("Translated Text (English)")
87
  st.write(translated_text)
88
 
 
89
  output_pdf_path = "translated_document.pdf"
90
  create_pdf(translated_text, output_pdf_path)
91
 
 
92
  with open(output_pdf_path, "rb") as f:
93
  st.download_button(
94
  label="Download Translated PDF",
 
97
  mime="application/pdf"
98
  )
99
  finally:
 
100
  if os.path.exists(temp_file_path):
101
  os.remove(temp_file_path)
102
  if os.path.exists(output_pdf_path):