tahirsher committed on
Commit
87fcfea
·
verified ·
1 Parent(s): f9d2d7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -47
app.py CHANGED
@@ -1,33 +1,30 @@
1
- import fitz # PyMuPDF
2
- import pytesseract
3
- from PIL import Image
4
- from transformers import pipeline
5
  import streamlit as st
6
  import os
7
  import io
 
8
 
9
- # Set the Tesseract path explicitly
10
- pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract' # Default path in most Linux systems
 
11
 
12
- # Set up the translation pipelines with error handling
13
- try:
14
- translator_to_english = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
15
- except Exception as e:
16
- st.error(f"Failed to load English translation model: {e}")
17
- translator_to_english = None
18
 
19
- try:
20
- translator_to_urdu = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur")
21
- except Exception as e:
22
- st.error(f"Failed to load Urdu translation model: {e}")
23
- translator_to_urdu = None
24
 
25
- # Function to extract text from an image using OCR
26
  def extract_text_from_image(image):
27
- text = pytesseract.image_to_string(image, lang='eng+urd')
 
 
28
  return text
29
 
30
- # Function to extract images and text from a PDF
31
  def extract_from_pdf(pdf_path):
32
  doc = fitz.open(pdf_path)
33
  full_text = ""
@@ -44,38 +41,70 @@ def extract_from_pdf(pdf_path):
44
  full_text += page.get_text() + "\n"
45
  return full_text
46
 
47
- # Function to translate text to English and Urdu
 
 
 
 
 
 
 
 
48
  def translate_text(text):
49
- english_translation = ""
50
- urdu_translation = ""
51
- if translator_to_english:
52
- english_translation = translator_to_english(text, max_length=400)[0]['translation_text']
53
- if translator_to_urdu:
54
- urdu_translation = translator_to_urdu(text, max_length=400)[0]['translation_text']
55
- return english_translation, urdu_translation
 
 
56
 
57
  # Streamlit UI
58
- st.title("PDF Document Translator")
59
- uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")
60
 
61
  if uploaded_file is not None:
62
- with st.spinner("Processing PDF..."):
63
  # Save the uploaded file temporarily
64
- with open("temp.pdf", "wb") as f:
 
 
65
  f.write(uploaded_file.getbuffer())
66
-
67
- # Extract text from the PDF
68
- extracted_text = extract_from_pdf("temp.pdf")
69
-
 
 
 
 
 
 
 
 
 
70
  # Translate the extracted text
71
- english_translation, urdu_translation = translate_text(extracted_text)
72
-
73
- # Display the translations
74
- st.subheader("English Translation")
75
- st.write(english_translation)
76
-
77
- st.subheader("Urdu Translation")
78
- st.write(urdu_translation)
79
-
80
- # Clean up the temporary file
81
- os.remove("temp.pdf")
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF for PDF processing
2
+ from PIL import Image # For image processing
3
+ from transformers import AutoTokenizer, AutoModelForImageTextToText, AutoModelForCausalLM, pipeline
 
4
  import streamlit as st
5
  import os
6
  import io
7
+ from docx import Document # For Word document processing
8
 
9
# Streamlit re-executes this script from top to bottom on every user
# interaction. Loading the models through st.cache_resource means the
# (very large) weights are read once per server process and then reused,
# instead of being reloaded on each rerun.


@st.cache_resource
def _load_trocr():
    """Load the TrOCR tokenizer and model used for image-to-text.

    Returns:
        A ``(tokenizer, model)`` tuple for "microsoft/trocr-large-printed".
    """
    tokenizer = AutoTokenizer.from_pretrained("microsoft/trocr-large-printed")
    model = AutoModelForImageTextToText.from_pretrained("microsoft/trocr-large-printed")
    return tokenizer, model


@st.cache_resource
def _load_translator():
    """Load the DeepSeek tokenizer, model and text-generation pipeline.

    Returns:
        A ``(tokenizer, model, pipeline)`` tuple for
        "deepseek-ai/DeepSeek-R1-Distill-Llama-8B".
    """
    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
    model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return tokenizer, model, generator


# Same module-level names as before, so the functions below keep working.
trocr_tokenizer, trocr_model = _load_trocr()
translation_tokenizer, translation_model, translator = _load_translator()
 
 
 
19
 
20
import functools


@functools.lru_cache(maxsize=1)
def _get_trocr_processor():
    """Return the TrOCR processor, loading it on first use only."""
    # Imported here so this helper is self-contained within the block.
    from transformers import TrOCRProcessor

    # Must name the same checkpoint as the module-level ``trocr_model``.
    return TrOCRProcessor.from_pretrained("microsoft/trocr-large-printed")


def extract_text_from_image(image):
    """Run TrOCR on an image and return the recognised text.

    The previous implementation passed the image to a text tokenizer, which
    only accepts strings. TrOCR is a vision encoder-decoder: the image has to
    be converted to ``pixel_values`` by the processor before generation.

    Args:
        image: A ``PIL.Image.Image`` in any mode; it is converted to RGB.

    Returns:
        The decoded text with special tokens removed.
    """
    # NOTE(review): TrOCR checkpoints are documented as working on single
    # text-line images; full-page scans may need line segmentation first.
    processor = _get_trocr_processor()
    pixel_values = processor(images=image.convert("RGB"), return_tensors="pt").pixel_values
    generated_ids = trocr_model.generate(pixel_values)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
26
 
27
+ # Function to extract text from a PDF
28
  def extract_from_pdf(pdf_path):
29
  doc = fitz.open(pdf_path)
30
  full_text = ""
 
41
  full_text += page.get_text() + "\n"
42
  return full_text
43
 
44
def extract_from_word(docx_path):
    """Return the text of a Word document, one paragraph per line.

    Args:
        docx_path: Path to the ``.docx`` file to read.

    Returns:
        The text of every paragraph in ``doc.paragraphs``, each followed by
        a newline character (so a non-empty result ends with a newline).
    """
    paragraphs = Document(docx_path).paragraphs
    return "".join(f"{paragraph.text}\n" for paragraph in paragraphs)
51
+
52
def translate_text(text, max_new_tokens=400):
    """Translate ``text`` into English with the text-generation pipeline.

    The pipeline wraps a causal language model, so the text is embedded in an
    explicit translation instruction; feeding the raw document would only
    make the model continue it.

    Args:
        text: The source text to translate.
        max_new_tokens: Upper bound on generated tokens. Unlike the previous
            ``max_length=400``, this does not count the prompt, so long
            inputs still leave room for an answer.

    Returns:
        The generated English text without the echoed prompt, or an empty
        string when ``text`` is empty or whitespace-only.
    """
    # Nothing to translate: avoid a pointless (and expensive) model call.
    if not text or not text.strip():
        return ""

    prompt = (
        "Translate the following text into English. "
        "Reply with the English translation only.\n\n"
        f"{text.strip()}\n\n"
        "English translation:"
    )
    # return_full_text=False drops the prompt from 'generated_text'.
    result = translator(prompt, max_new_tokens=max_new_tokens, return_full_text=False)
    generated = result[0]["generated_text"]

    # NOTE(review): DeepSeek-R1 distills may emit a reasoning section closed
    # by "</think>" before the answer; keep only what follows it. When the
    # marker is absent, rpartition leaves the whole output in slot [2].
    return generated.rpartition("</think>")[2].strip()
56
+
57
def create_pdf(translated_text, output_path):
    """Write ``translated_text`` to a PDF, wrapping lines and adding pages.

    The previous version drew the whole text with a single ``insert_text``
    call at one point, which neither wraps long lines nor starts new pages,
    so anything beyond the first page's edges was lost.

    Args:
        translated_text: The text to render. Existing line breaks are kept.
        output_path: Destination path of the PDF file.
    """
    font_name = "helv"
    font_size = 12
    margin = 50
    line_height = font_size * 1.4

    def _wrap(paragraph, max_width):
        """Greedy word wrap measured with the real font metrics."""
        wrapped, current = [], ""
        for word in paragraph.split():
            candidate = f"{current} {word}" if current else word
            too_wide = fitz.get_text_length(
                candidate, fontname=font_name, fontsize=font_size
            ) > max_width
            if current and too_wide:
                wrapped.append(current)
                current = word
            else:
                # A single word wider than the line is left unsplit.
                current = candidate
        wrapped.append(current)  # "" for a blank paragraph keeps the gap
        return wrapped

    doc = fitz.open()
    try:
        # Always create one page: a document without pages cannot be saved.
        page = doc.new_page()
        max_width = page.rect.width - 2 * margin
        # Baselines run from y=margin down to y=height-margin.
        lines_per_page = int((page.rect.height - 2 * margin) // line_height) + 1

        lines = [
            line
            for paragraph in translated_text.splitlines()
            for line in _wrap(paragraph, max_width)
        ]
        for index, line in enumerate(lines):
            row = index % lines_per_page
            if index and row == 0:
                page = doc.new_page()
            if line:
                page.insert_text(
                    (margin, margin + row * line_height),
                    line,
                    fontsize=font_size,
                    fontname=font_name,
                )
        doc.save(output_path)
    finally:
        doc.close()
63
 
64
  # Streamlit UI
65
import tempfile  # per-request scratch directory for the upload and the PDF

st.title("Multilingual Document Translator")
uploaded_file = st.file_uploader(
    "Upload a document (PDF, Word, or Image)",
    type=["pdf", "docx", "jpg", "jpeg", "png"],
)

if uploaded_file is not None:
    with st.spinner("Processing document..."):
        # splitext yields "" for names without a dot instead of the whole name.
        file_extension = os.path.splitext(uploaded_file.name)[1].lstrip(".").lower()

        # A private temporary directory replaces the fixed "temp.<ext>" and
        # "translated_document.pdf" paths, which concurrent sessions would
        # overwrite. It is removed on exit even if a step raises or
        # st.stop() aborts the run.
        with tempfile.TemporaryDirectory() as work_dir:
            temp_file_path = os.path.join(work_dir, f"upload.{file_extension}")
            with open(temp_file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())

            # Extract text based on file type
            if file_extension == "pdf":
                extracted_text = extract_from_pdf(temp_file_path)
            elif file_extension in ["jpg", "jpeg", "png"]:
                with Image.open(temp_file_path) as image:
                    extracted_text = extract_text_from_image(image)
            elif file_extension == "docx":
                extracted_text = extract_from_word(temp_file_path)
            else:
                st.error("Unsupported file format.")
                st.stop()

            # Skip the expensive model call when there is nothing to translate.
            if not extracted_text.strip():
                st.warning("No text could be extracted from the document.")
                st.stop()

            # Translate the extracted text
            translated_text = translate_text(extracted_text)

            # Build the PDF and keep its bytes so the file can be deleted
            # before the download button is rendered.
            output_pdf_path = os.path.join(work_dir, "translated_document.pdf")
            create_pdf(translated_text, output_pdf_path)
            with open(output_pdf_path, "rb") as f:
                pdf_bytes = f.read()

    # Display the translated text
    st.subheader("Translated Text (English)")
    st.write(translated_text)

    # Provide a download link for the translated PDF
    st.download_button(
        label="Download Translated PDF",
        data=pdf_bytes,
        file_name="translated_document.pdf",
        mime="application/pdf",
    )