tahirsher committed on
Commit
ded567d
·
verified ·
1 Parent(s): 59f49e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -43
app.py CHANGED
@@ -2,11 +2,12 @@ import streamlit as st
2
  import PyPDF2
3
  import docx2txt
4
  from transformers import pipeline
5
- import sentencepiece
 
 
6
 
7
  # Load translation models
8
  def load_translation_models():
9
- """Load translation models for English and Urdu."""
10
  try:
11
  translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
12
  translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
@@ -17,59 +18,33 @@ def load_translation_models():
17
 
18
  translator_en, translator_ur = load_translation_models()
19
 
20
- def extract_text_from_pdf(file):
21
- """Extract text from a PDF file."""
22
  text = ""
23
  try:
24
- pdf_reader = PyPDF2.PdfReader(file)
25
- for page in pdf_reader.pages:
26
- text += page.extract_text() or ""
 
 
27
  except Exception as e:
28
- st.error(f"Error extracting text from PDF: {e}")
29
  return text
30
 
31
- def extract_text_from_word(file):
32
- """Extract text from a Word file."""
33
- try:
34
- return docx2txt.process(file)
35
- except Exception as e:
36
- st.error(f"Error extracting text from Word document: {e}")
37
- return ""
38
-
39
- def translate_text(text, translator):
40
- """Translate text in manageable chunks."""
41
- max_chunk_size = 512
42
- text_chunks = [text[i:i + max_chunk_size] for i in range(0, len(text), max_chunk_size)]
43
- translations = []
44
-
45
- for chunk in text_chunks:
46
- try:
47
- result = translator(chunk)
48
- translations.append(result[0]['translation_text'])
49
- except Exception as e:
50
- st.error(f"Error during translation: {e}")
51
- return ""
52
- return " ".join(translations)
53
-
54
- # Streamlit UI
55
- st.title("📚 Multilingual Document Translator")
56
- st.write("Translate PDF or Word documents to English and Urdu effortlessly!")
57
-
58
  uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
59
  target_language = st.radio("Select target language for translation", ["English", "Urdu"])
60
 
61
  if uploaded_file:
62
- # Extract text from the uploaded file
63
- if uploaded_file.name.endswith(".pdf"):
64
- text_content = extract_text_from_pdf(uploaded_file)
65
- else:
66
- text_content = extract_text_from_word(uploaded_file)
67
-
68
- # Show extracted text preview
69
  st.subheader("Extracted Text (Preview)")
70
  st.write(text_content[:500] if text_content else "No content found in the file.")
71
 
72
- # Perform translation when the user clicks the button
73
  if st.button("Translate"):
74
  if text_content:
75
  st.subheader(f"Translated Text ({target_language})")
 
2
  import PyPDF2
3
  import docx2txt
4
  from transformers import pipeline
5
+ import pytesseract
6
+ from pdf2image import convert_from_path
7
+ from PIL import Image
8
 
9
  # Load translation models
10
  def load_translation_models():
 
11
  try:
12
  translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
13
  translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
 
18
 
19
  translator_en, translator_ur = load_translation_models()
20
 
21
+ def extract_text_from_pdf_with_ocr(file_path):
22
+ """Extract text from image-based PDF using OCR."""
23
  text = ""
24
  try:
25
+ # Convert PDF to images
26
+ pages = convert_from_path(file_path, 300)
27
+ for page in pages:
28
+ image = Image.fromarray(page)
29
+ text += pytesseract.image_to_string(image) + "\n"
30
  except Exception as e:
31
+ st.error(f"Error during OCR extraction: {e}")
32
  return text
33
 
34
+ # Streamlit UI for document translation
35
+ st.title("📚 Multilingual Document Translator with OCR Support")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
37
  target_language = st.radio("Select target language for translation", ["English", "Urdu"])
38
 
39
  if uploaded_file:
40
+ file_path = f"/mnt/data/{uploaded_file.name}"
41
+
42
+ # OCR-based text extraction for PDFs
43
+ text_content = extract_text_from_pdf_with_ocr(file_path) if uploaded_file.name.endswith(".pdf") else extract_text_from_word(uploaded_file)
44
+
 
 
45
  st.subheader("Extracted Text (Preview)")
46
  st.write(text_content[:500] if text_content else "No content found in the file.")
47
 
 
48
  if st.button("Translate"):
49
  if text_content:
50
  st.subheader(f"Translated Text ({target_language})")