tahirsher committed on
Commit
28d4d28
·
verified ·
1 Parent(s): 91f9ddd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -125
app.py CHANGED
@@ -1,153 +1,117 @@
1
- import fitz # PyMuPDF for PDF processing
2
- from PIL import Image
3
  import pytesseract
4
- from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration
5
  import streamlit as st
6
  import os
7
- import re
8
- from docx import Document
9
  from langdetect import detect
 
 
 
10
 
11
@st.cache_resource
def _load_models():
    """Load the BLIP-2 processor/model and the translation pipeline once.

    Streamlit re-executes the whole script on every user interaction;
    caching the loaded objects avoids re-instantiating them on each rerun.

    Returns:
        A ``(processor, model, translator)`` tuple.
    """
    blip_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
    blip_model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
    translation_pipeline = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
    return blip_processor, blip_model, translation_pipeline


# Module-level names kept as before so the functions below can use them.
processor, model, translator = _load_models()

# Path to Tesseract executable for OCR
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
20
 
21
 
22
def extract_text_from_image(image):
    """Return text for an image, preferring BLIP-2 output over Tesseract OCR.

    The image is converted to RGB, passed through the module-level
    ``processor`` and ``model``, and the first decoded sequence is used.
    Tesseract is consulted only when that sequence is empty after stripping.

    NOTE(review): because any non-empty model output wins, OCR only runs when
    generation yields nothing -- confirm this priority is intended.

    Args:
        image: An object exposing ``convert("RGB")`` (presumably a PIL image).

    Returns:
        The extracted text with surrounding whitespace removed.
    """
    rgb_image = image.convert("RGB")

    model_inputs = processor(images=rgb_image, return_tensors="pt")
    token_ids = model.generate(**model_inputs)
    generated = processor.batch_decode(token_ids, skip_special_tokens=True)[0].strip()
    if generated:
        return generated

    # Nothing usable from the model: fall back to OCR on the same RGB image.
    return pytesseract.image_to_string(rgb_image).strip()
35
-
36
-
37
def extract_from_pdf(pdf_path):
    """Extract text from a PDF, using image-based extraction for empty pages.

    Each page's embedded text is read directly. When a page has no text
    (whitespace only), the page is rendered to a pixmap and handed to
    ``extract_text_from_image``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined by newlines, with surrounding
        whitespace removed.
    """
    page_texts = []
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text = page.get_text()

            # No embedded text on this page: render it and extract from the image.
            if not text.strip():
                pix = page.get_pixmap()
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                text = extract_text_from_image(image)

            page_texts.append(text)
    finally:
        # Release the file handle even if extraction fails part-way through.
        doc.close()

    return "\n".join(page_texts).strip()
56
-
57
-
58
def extract_from_word(docx_path):
    """Return the text of every paragraph in a Word file, one per line.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        Paragraph texts joined by newlines, with surrounding whitespace
        removed.
    """
    paragraphs = Document(docx_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs).strip()
64
-
65
-
66
def clean_text(text):
    """Remove non-printable control characters and trim the result.

    Tab, line feed and carriage return are kept: deleting them would glue
    together the words on either side of a line break.

    Args:
        text: The raw extracted text.

    Returns:
        ``text`` without C0/C1 control characters (other than tab, LF and
        CR) and without leading or trailing whitespace.
    """
    return re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]", "", text).strip()
68
-
69
-
70
def translate_text(text, chunk_chars=400):
    """Translate ``text`` to English with the module-level ``translator``.

    The text is split on whitespace into chunks of at most ``chunk_chars``
    characters so that each translator call receives a short input; every
    call is made with ``max_length=400``. The detected language is written
    to the Streamlit page.

    Args:
        text: The text to translate.
        chunk_chars: Maximum number of characters per translator call.

    Returns:
        The translated text, or one of two fixed messages when the input is
        blank or is detected as English.
    """
    import textwrap

    from langdetect import LangDetectException

    if not text.strip():
        return "No text available for translation."

    try:
        detected_language = detect(text)
    except LangDetectException:
        # Raised for text without usable features (e.g. only digits);
        # report it as unknown and still attempt a translation.
        detected_language = "unknown"
    st.write(f"Detected language: {detected_language}")

    if detected_language == "en":
        return "The text is already in English."

    translated_parts = []
    for chunk in textwrap.wrap(text, width=chunk_chars):
        result = translator(chunk, max_length=400)
        if isinstance(result, list) and result and "translation_text" in result[0]:
            translated_parts.append(result[0]["translation_text"])
    return " ".join(translated_parts).strip()
87
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
def create_pdf(translated_text, output_path):
    """Write ``translated_text`` to a PDF, adding pages as needed.

    The text is wrapped to a fixed number of characters per line and laid
    out a fixed number of lines per page, so long input spills onto further
    pages instead of being lost when it does not fit a single text box.

    NOTE(review): the characters-per-line and lines-per-page limits are
    approximations for 12pt Helvetica on the default page size -- confirm
    against real output.

    Args:
        translated_text: The text to place in the document.
        output_path: Destination path of the PDF file.
    """
    import textwrap

    font_size = 12
    max_chars_per_line = 70
    lines_per_page = 40
    left_margin, top_margin = 50, 50

    # Wrap each source line separately so existing line breaks are kept.
    lines = []
    for source_line in translated_text.splitlines():
        lines.extend(textwrap.wrap(source_line, width=max_chars_per_line) or [""])

    doc = fitz.open()
    try:
        # max(..., 1) keeps the original behaviour of producing one (blank)
        # page for empty input; a PDF without pages cannot be saved.
        for start in range(0, max(len(lines), 1), lines_per_page):
            page = doc.new_page()
            page_lines = lines[start:start + lines_per_page]
            if page_lines:
                page.insert_text(
                    (left_margin, top_margin + font_size),
                    "\n".join(page_lines),
                    fontsize=font_size,
                    fontname="helv",
                    color=(0, 0, 0),
                )
        doc.save(output_path)
    finally:
        doc.close()
104
 
105
 
106
import tempfile

# Streamlit UI: upload a document, extract its text, translate it and offer
# the translation as a PDF download.
st.title("Multilingual Document Translator")
uploaded_file = st.file_uploader("Upload a document (PDF, Word, or Image)", type=["pdf", "docx", "jpg", "jpeg", "png"])

if uploaded_file is not None:
    # A private temporary directory per run avoids clashes between sessions
    # that would otherwise share fixed file names in the working directory;
    # it is removed automatically, even when processing fails or st.stop()
    # is called.
    with st.spinner("Processing document..."), tempfile.TemporaryDirectory() as work_dir:
        file_extension = uploaded_file.name.split(".")[-1].lower()
        temp_file_path = os.path.join(work_dir, f"temp.{file_extension}")
        with open(temp_file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        if file_extension == "pdf":
            extracted_text = extract_from_pdf(temp_file_path)
        elif file_extension in ["jpg", "jpeg", "png"]:
            # Close the image file before the temporary directory is removed.
            with Image.open(temp_file_path) as image:
                extracted_text = extract_text_from_image(image)
        elif file_extension == "docx":
            extracted_text = extract_from_word(temp_file_path)
        else:
            st.error("Unsupported file format.")
            st.stop()

        extracted_text = clean_text(extracted_text)
        st.write("Extracted Text (First 50000 characters):", extracted_text[:50000])

        translated_text = translate_text(extracted_text)

        st.subheader("Translated Text (English)")
        st.write(translated_text)

        if translated_text.strip():
            output_pdf_path = os.path.join(work_dir, "translated_document.pdf")
            create_pdf(translated_text, output_pdf_path)

            # Read the bytes up front so the download does not depend on the
            # temporary file still existing.
            with open(output_pdf_path, "rb") as f:
                pdf_bytes = f.read()

            st.download_button(
                label="Download Translated PDF",
                data=pdf_bytes,
                file_name="translated_document.pdf",
                mime="application/pdf"
            )
        else:
            st.warning("No content to save in the translated PDF.")
 
1
+ import pdfplumber
 
2
  import pytesseract
3
+ from transformers import pipeline
4
  import streamlit as st
5
  import os
6
+ import docx
 
7
  from langdetect import detect
8
+ from PIL import Image
9
+ from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
10
+ from docx.shared import Pt
11
 
12
@st.cache_resource
def _load_translator():
    """Load the translation pipeline once.

    Streamlit re-executes the whole script on every user interaction;
    caching the pipeline avoids re-instantiating it on each rerun.
    """
    return pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")


# Module-level name kept as before so the functions below can use it.
translator = _load_translator()

# Ensure Tesseract path is set (modify for your environment)
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
17
 
18
 
19
def extract_text_blocks_from_pdf(pdf_path):
    """Extract tables and text lines from a PDF as a list of typed blocks.

    For every page the detected tables are emitted first, followed by one
    ``text`` block per line of the remaining page text. Characters whose
    centre lies inside a detected table are left out of the plain text, so
    table content is not emitted a second time as loose lines.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list of dicts: ``{"type": "table", "content": <rows>}`` or
        ``{"type": "text", "content": <line>}``.
    """
    extracted_content = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            tables = page.find_tables()
            for table in tables:
                extracted_content.append({"type": "table", "content": table.extract()})

            table_boxes = [table.bbox for table in tables]

            def outside_tables(obj, boxes=table_boxes):
                """Keep every object except characters centred inside a table."""
                if obj.get("object_type") != "char":
                    return True
                center_x = (obj["x0"] + obj["x1"]) / 2
                center_y = (obj["top"] + obj["bottom"]) / 2
                return not any(
                    x0 <= center_x <= x1 and top <= center_y <= bottom
                    for (x0, top, x1, bottom) in boxes
                )

            text_source = page.filter(outside_tables) if table_boxes else page
            page_text = text_source.extract_text()
            if page_text:
                extracted_content.extend(
                    {"type": "text", "content": line} for line in page_text.split("\n")
                )

    return extracted_content
38
+
39
+
40
def _translate_to_english(text):
    """Translate one string to English unless it is blank or already English."""
    from langdetect import LangDetectException

    if not text or not text.strip():
        return text or ""

    try:
        if detect(text) == "en":
            return text
    except LangDetectException:
        # Raised for text without usable features (e.g. page numbers or
        # punctuation only); there is nothing to translate, keep it as is.
        return text

    return translator(text, max_length=400)[0]["translation_text"]


def translate_content_blockwise(content_blocks):
    """Translate text and table blocks, keeping the block structure.

    Text blocks that are blank are dropped; all other text blocks and every
    table cell are translated with the same rule: content detected as
    English, or content whose language cannot be detected, is passed
    through unchanged, everything else is sent to the translator.

    Args:
        content_blocks: List of ``{"type": ..., "content": ...}`` dicts, where
            ``type`` is ``"text"`` (content is a string) or ``"table"``
            (content is a list of rows of cell strings or ``None``).

    Returns:
        A new list of blocks of the same shape holding the translated content.
    """
    translated_content = []

    for block in content_blocks:
        if block["type"] == "text" and block["content"].strip():
            translated_content.append(
                {"type": "text", "content": _translate_to_english(block["content"])}
            )

        elif block["type"] == "table":
            translated_table = [
                [_translate_to_english(cell) if cell else "" for cell in row]
                for row in block["content"]
            ]
            translated_content.append({"type": "table", "content": translated_table})

    return translated_content
 
62
 
 
 
63
 
64
def generate_translated_docx(translated_content, output_path):
    """Write translated blocks to a Word document.

    Text blocks become left-aligned paragraphs and table blocks become
    tables. The document's ``Normal`` style is set to 12pt once, rather than
    re-assigning the shared style for every paragraph.

    Args:
        translated_content: List of ``{"type": ..., "content": ...}`` dicts as
            produced by ``translate_content_blockwise``.
        output_path: Destination path of the .docx file.
    """
    doc = docx.Document()
    doc.styles["Normal"].font.size = Pt(12)

    for block in translated_content:
        if block["type"] == "text":
            para = doc.add_paragraph(block["content"])
            para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT

        elif block["type"] == "table":
            rows = block["content"]
            # Size the table by its widest row so ragged rows do not index
            # past the last column; skip tables with no cells at all.
            column_count = max((len(row) for row in rows), default=0)
            if column_count == 0:
                continue

            table = doc.add_table(rows=len(rows), cols=column_count)
            for i, row in enumerate(rows):
                for j, cell_text in enumerate(row):
                    table.cell(i, j).text = "" if cell_text is None else str(cell_text)

    doc.save(output_path)
81
 
82
 
83
import tempfile

# Streamlit UI: upload a PDF, translate it block by block and offer the
# result as a Word document download.
st.title("Professional Multilingual PDF Translator")
uploaded_file = st.file_uploader("Upload a PDF document for structured translation", type=["pdf"])

if uploaded_file is not None:
    # A private temporary directory per run replaces the fixed file names and
    # the manual cleanup: it is removed automatically, and a failure during
    # extraction or translation can no longer hit an undefined output path
    # in the cleanup code.
    with st.spinner("Processing and translating the document..."), tempfile.TemporaryDirectory() as work_dir:
        temp_file_path = os.path.join(work_dir, "uploaded_document.pdf")
        with open(temp_file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())

        # Extract structured content
        content_blocks = extract_text_blocks_from_pdf(temp_file_path)

        # Translate content blockwise
        translated_content = translate_content_blockwise(content_blocks)

        # Create translated DOCX file
        output_docx_path = os.path.join(work_dir, "translated_document.docx")
        generate_translated_docx(translated_content, output_docx_path)

        # Read the bytes up front so the download does not depend on the
        # temporary file still existing.
        with open(output_docx_path, "rb") as f:
            docx_bytes = f.read()

        # Provide download link for the translated document
        st.download_button(
            label="Download Translated Document",
            data=docx_bytes,
            file_name="translated_document.docx",
            mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        )