tahirsher committed on
Commit
c06572a
·
verified ·
1 Parent(s): 53afdc8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -46
app.py CHANGED
@@ -1,71 +1,72 @@
1
- import pdfplumber
2
  import pytesseract
 
 
3
  from transformers import pipeline
4
  import streamlit as st
5
- import os
6
  import docx
7
- from langdetect import detect
8
- from PIL import Image
9
  from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
10
  from docx.shared import Pt
 
11
 
12
- # Load the translation model
13
  translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
14
 
15
- # Ensure Tesseract path is set (modify for your environment)
16
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
17
 
18
 
19
- def extract_text_blocks_from_pdf(pdf_path):
20
- """Extract text blocks while preserving structure (tables, paragraphs) from the PDF."""
21
- extracted_content = []
22
-
23
  with pdfplumber.open(pdf_path) as pdf:
24
  for page in pdf.pages:
25
  # Extract tables
26
  tables = page.extract_tables()
27
  for table in tables:
28
- extracted_content.append({"type": "table", "content": table})
29
-
30
- # Extract plain text
31
- text_blocks = page.extract_text()
32
- if text_blocks:
33
- paragraphs = text_blocks.split("\n")
 
 
 
 
34
  for para in paragraphs:
35
- extracted_content.append({"type": "text", "content": para})
 
 
36
 
37
- return extracted_content
38
 
 
 
 
39
 
40
- def translate_content_blockwise(content_blocks):
41
- """Translate text blocks and return structured results."""
42
- translated_content = []
43
-
44
  for block in content_blocks:
45
  if block["type"] == "text" and block["content"].strip():
46
- detected_language = detect(block["content"])
47
- if detected_language != "en":
48
- translated_text = translator(block["content"], max_length=400)[0]["translation_text"]
49
- else:
50
- translated_text = block["content"]
51
- translated_content.append({"type": "text", "content": translated_text})
52
-
53
  elif block["type"] == "table":
54
- # Translate table rows
55
  translated_table = []
56
  for row in block["content"]:
57
- translated_row = [translator(cell, max_length=400)[0]["translation_text"] if cell else "" for cell in row]
 
 
58
  translated_table.append(translated_row)
59
- translated_content.append({"type": "table", "content": translated_table})
60
-
61
- return translated_content
62
 
63
 
64
- def generate_translated_docx(translated_content, output_path):
65
- """Generate a Word document with the translated content preserving tables and formatting."""
66
  doc = docx.Document()
67
 
68
- for block in translated_content:
69
  if block["type"] == "text":
70
  para = doc.add_paragraph(block["content"])
71
  para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
@@ -91,16 +92,16 @@ if uploaded_file is not None:
91
  f.write(uploaded_file.getbuffer())
92
 
93
  try:
94
- # Extract structured content
95
- content_blocks = extract_text_blocks_from_pdf(temp_file_path)
96
-
97
- # Translate content blockwise
98
- translated_content = translate_content_blockwise(content_blocks)
99
-
100
- # Create translated DOCX file
101
  output_docx_path = "translated_document.docx"
102
- generate_translated_docx(translated_content, output_docx_path)
103
-
104
  # Provide download link for the translated document
105
  with open(output_docx_path, "rb") as f:
106
  st.download_button(
 
1
+ import PyPDF2
2
  import pytesseract
3
+ from PIL import Image
4
+ import pdfplumber
5
  from transformers import pipeline
6
  import streamlit as st
 
7
  import docx
 
 
8
  from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
9
  from docx.shared import Pt
10
+ import os
11
 
12
+ # Translation model pipeline
13
  translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
14
 
15
+ # Set Tesseract path (modify for your environment)
16
  pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
17
 
18
 
19
+ def extract_text_and_tables(pdf_path):
20
+ """Extract structured content from PDF, including tables and text."""
21
+ content_blocks = []
22
+
23
  with pdfplumber.open(pdf_path) as pdf:
24
  for page in pdf.pages:
25
  # Extract tables
26
  tables = page.extract_tables()
27
  for table in tables:
28
+ content_blocks.append({"type": "table", "content": table})
29
+
30
+ # Extract text as paragraphs
31
+ text = page.extract_text()
32
+ if not text: # Fallback to OCR if text extraction fails
33
+ pix = page.to_image()
34
+ text = pytesseract.image_to_string(pix.original)
35
+
36
+ if text:
37
+ paragraphs = text.split("\n")
38
  for para in paragraphs:
39
+ content_blocks.append({"type": "text", "content": para.strip()})
40
+
41
+ return content_blocks
42
 
 
43
 
44
+ def translate_content(content_blocks):
45
+ """Translate extracted content preserving structure."""
46
+ translated_blocks = []
47
 
 
 
 
 
48
  for block in content_blocks:
49
  if block["type"] == "text" and block["content"].strip():
50
+ translated_text = translator(block["content"], max_length=400)[0]["translation_text"]
51
+ translated_blocks.append({"type": "text", "content": translated_text})
52
+
 
 
 
 
53
  elif block["type"] == "table":
 
54
  translated_table = []
55
  for row in block["content"]:
56
+ translated_row = [
57
+ translator(cell, max_length=400)[0]["translation_text"] if cell else "" for cell in row
58
+ ]
59
  translated_table.append(translated_row)
60
+ translated_blocks.append({"type": "table", "content": translated_table})
61
+
62
+ return translated_blocks
63
 
64
 
65
+ def create_translated_doc(translated_blocks, output_path):
66
+ """Generate a translated Word document preserving tables and text."""
67
  doc = docx.Document()
68
 
69
+ for block in translated_blocks:
70
  if block["type"] == "text":
71
  para = doc.add_paragraph(block["content"])
72
  para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
 
92
  f.write(uploaded_file.getbuffer())
93
 
94
  try:
95
+ # Extract content from the PDF
96
+ content_blocks = extract_text_and_tables(temp_file_path)
97
+
98
+ # Translate content
99
+ translated_blocks = translate_content(content_blocks)
100
+
101
+ # Generate translated DOCX
102
  output_docx_path = "translated_document.docx"
103
+ create_translated_doc(translated_blocks, output_docx_path)
104
+
105
  # Provide download link for the translated document
106
  with open(output_docx_path, "rb") as f:
107
  st.download_button(