tahirsher committed on
Commit
63f5b6d
·
verified ·
1 Parent(s): b0b875d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -44
app.py CHANGED
@@ -3,9 +3,8 @@ from PIL import Image # For image processing
3
  from transformers import pipeline
4
  import streamlit as st
5
  import os
6
- import io
7
  from docx import Document # For Word document processing
8
- import asyncio # For asynchronous processing
9
 
10
  # Load the TrOCR model for image-to-text (smaller model)
11
  trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
@@ -16,7 +15,7 @@ translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
16
  # Function to extract text from an image using TrOCR
17
  def extract_text_from_image(image):
18
  result = trocr_pipeline(image)
19
- return result[0]['generated_text']
20
 
21
  # Function to extract text from a PDF (optimized for performance)
22
  def extract_from_pdf(pdf_path):
@@ -24,9 +23,8 @@ def extract_from_pdf(pdf_path):
24
  full_text = ""
25
  for page_num in range(len(doc)):
26
  page = doc.load_page(page_num)
27
- # Extract text directly from the page (faster than OCR for text-based PDFs)
28
  full_text += page.get_text() + "\n"
29
- return full_text
30
 
31
  # Function to extract text from a Word document
32
  def extract_from_word(docx_path):
@@ -34,16 +32,21 @@ def extract_from_word(docx_path):
34
  full_text = ""
35
  for para in doc.paragraphs:
36
  full_text += para.text + "\n"
37
- return full_text
 
 
 
 
38
 
39
  # Function to translate text to English (batched for performance)
40
  def translate_text(text):
41
- # Split text into smaller chunks for faster translation
42
  chunks = [text[i:i + 500] for i in range(0, len(text), 500)]
43
  translated_text = ""
44
  for chunk in chunks:
45
- translated_chunk = translator(chunk, max_length=400)[0]['translation_text']
46
- translated_text += translated_chunk + " "
 
 
47
  return translated_text.strip()
48
 
49
  # Function to create a PDF from translated text
@@ -65,38 +68,44 @@ if uploaded_file is not None:
65
  with open(temp_file_path, "wb") as f:
66
  f.write(uploaded_file.getbuffer())
67
 
68
- # Extract text based on file type
69
- if file_extension == "pdf":
70
- extracted_text = extract_from_pdf(temp_file_path)
71
- elif file_extension in ["jpg", "jpeg", "png"]:
72
- image = Image.open(temp_file_path)
73
- extracted_text = extract_text_from_image(image)
74
- elif file_extension == "docx":
75
- extracted_text = extract_from_word(temp_file_path)
76
- else:
77
- st.error("Unsupported file format.")
78
- st.stop()
79
-
80
- # Translate the extracted text
81
- translated_text = translate_text(extracted_text)
82
-
83
- # Display the translated text
84
- st.subheader("Translated Text (English)")
85
- st.write(translated_text)
86
-
87
- # Create a PDF from the translated text
88
- output_pdf_path = "translated_document.pdf"
89
- create_pdf(translated_text, output_pdf_path)
90
-
91
- # Provide a download link for the translated PDF
92
- with open(output_pdf_path, "rb") as f:
93
- st.download_button(
94
- label="Download Translated PDF",
95
- data=f,
96
- file_name="translated_document.pdf",
97
- mime="application/pdf"
98
- )
99
-
100
- # Clean up temporary files
101
- os.remove(temp_file_path)
102
- os.remove(output_pdf_path)
 
 
 
 
 
 
 
3
  from transformers import pipeline
4
  import streamlit as st
5
  import os
6
+ import re
7
  from docx import Document # For Word document processing
 
8
 
9
  # Load the TrOCR model for image-to-text (smaller model)
10
  trocr_pipeline = pipeline("image-to-text", model="microsoft/trocr-base-printed")
 
15
  # Function to extract text from an image using TrOCR
16
  def extract_text_from_image(image):
17
  result = trocr_pipeline(image)
18
+ return result[0]['generated_text'] if result else ""
19
 
20
  # Function to extract text from a PDF (optimized for performance)
21
  def extract_from_pdf(pdf_path):
 
23
  full_text = ""
24
  for page_num in range(len(doc)):
25
  page = doc.load_page(page_num)
 
26
  full_text += page.get_text() + "\n"
27
+ return full_text.strip()
28
 
29
  # Function to extract text from a Word document
30
  def extract_from_word(docx_path):
 
32
  full_text = ""
33
  for para in doc.paragraphs:
34
  full_text += para.text + "\n"
35
+ return full_text.strip()
36
+
37
+ # Function to clean extracted text
38
def clean_text(text):
    """Strip non-printable control characters from ``text``.

    Tabs and newlines are preserved, and other line separators (carriage
    return, vertical tab, form feed) are converted to ``"\\n"``. Removing
    them outright would glue the last word of one line or paragraph to the
    first word of the next and corrupt the text handed to the translator.

    Args:
        text: Raw text extracted from a document or image.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Normalize every kind of line break to "\n" before stripping controls.
    text = text.replace("\r\n", "\n")
    text = re.sub(r'[\r\x0b\x0c]', '\n', text)
    # Drop the remaining C0/C1 control characters (tab and newline excluded).
    return re.sub(r'[\x00-\x08\x0e-\x1f\x7f-\x9f]', '', text).strip()
40
 
41
  # Function to translate text to English (batched for performance)
42
  def translate_text(text):
 
43
  chunks = [text[i:i + 500] for i in range(0, len(text), 500)]
44
  translated_text = ""
45
  for chunk in chunks:
46
+ if chunk.strip():
47
+ translated_chunk = translator(chunk, max_length=400)
48
+ if isinstance(translated_chunk, list) and 'translation_text' in translated_chunk[0]:
49
+ translated_text += translated_chunk[0]['translation_text'] + " "
50
  return translated_text.strip()
51
 
52
  # Function to create a PDF from translated text
 
68
  with open(temp_file_path, "wb") as f:
69
  f.write(uploaded_file.getbuffer())
70
 
71
# Defined before the try block so the cleanup in ``finally`` can always
# refer to it, even when extraction fails or the run is stopped before the
# PDF has been written (previously this raised NameError during cleanup).
output_pdf_path = "translated_document.pdf"

try:
    # Extract text based on file type
    if file_extension == "pdf":
        extracted_text = extract_from_pdf(temp_file_path)
    elif file_extension in ["jpg", "jpeg", "png"]:
        # Close the image handle before the temporary file is removed.
        with Image.open(temp_file_path) as image:
            extracted_text = extract_text_from_image(image)
    elif file_extension == "docx":
        extracted_text = extract_from_word(temp_file_path)
    else:
        st.error("Unsupported file format.")
        st.stop()

    # Clean and translate the extracted text
    extracted_text = clean_text(extracted_text)
    st.write("Extracted Text for Debugging (First 500 characters):", extracted_text[:500])

    translated_text = translate_text(extracted_text)

    # Display the translated text
    st.subheader("Translated Text (English)")
    st.write(translated_text)

    # Create a PDF from the translated text
    create_pdf(translated_text, output_pdf_path)

    # Provide a download link for the translated PDF
    with open(output_pdf_path, "rb") as f:
        st.download_button(
            label="Download Translated PDF",
            data=f,
            file_name="translated_document.pdf",
            mime="application/pdf"
        )
finally:
    # Clean up temporary files; either may be missing if processing
    # stopped early, so check before removing.
    for path in (temp_file_path, output_pdf_path):
        if os.path.exists(path):
            os.remove(path)