tahirsher committed on
Commit
f53330e
·
verified ·
1 Parent(s): 589a63e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -23
app.py CHANGED
@@ -1,46 +1,70 @@
1
  import streamlit as st
2
- import docx2txt
3
- from transformers import pipeline
4
  import pytesseract
5
  from pdf2image import convert_from_path
6
  from PIL import Image
7
- import os
8
 
9
- # Initialize translation models
 
 
 
 
 
 
 
 
 
 
 
 
10
  def load_translation_models():
11
- """Load translation models."""
12
- try:
13
- translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
14
- translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
15
- return translator_en, translator_ur
16
- except Exception as e:
17
- st.error(f"Error initializing translation models: {e}")
18
- return None, None
19
 
20
  translator_en, translator_ur = load_translation_models()
 
21
 
22
  def extract_text_from_pdf_with_ocr(file_path):
23
- """Extract text from image-based PDF using OCR."""
24
  text = ""
25
  try:
26
- # Convert PDF to images with Poppler support
27
- pages = convert_from_path(file_path, 300, poppler_path="/path-to-poppler-bin") # Update poppler_path on Windows if necessary
28
- for page in pages:
29
- text += pytesseract.image_to_string(page) + "\n"
30
  except Exception as e:
31
  st.error(f"Error during OCR extraction: {e}")
32
  return text
33
 
34
- # Streamlit UI for document translation
35
- st.title("📚 Multilingual Document Translator with OCR Support")
36
- uploaded_file = st.file_uploader("Upload a PDF or Word file", type=["pdf", "docx"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  target_language = st.radio("Select target language for translation", ["English", "Urdu"])
38
 
39
  if uploaded_file:
40
  file_path = f"/mnt/data/{uploaded_file.name}"
41
-
42
- # OCR-based text extraction for PDFs
43
- text_content = extract_text_from_pdf_with_ocr(file_path) if uploaded_file.name.endswith(".pdf") else extract_text_from_word(uploaded_file)
44
 
45
  st.subheader("Extracted Text (Preview)")
46
  st.write(text_content[:500] if text_content else "No content found in the file.")
 
1
  import streamlit as st
2
+ from transformers import pipeline, AutoTokenizer, AutoModelForImageTextToText
 
3
  import pytesseract
4
  from pdf2image import convert_from_path
5
  from PIL import Image
6
+ import torch
7
 
8
+ # Image-to-Text Model (TrOCR)
9
@st.cache_resource
def load_image_to_text_model():
    """Load the TrOCR preprocessor and model used for image-to-text extraction.

    The first element of the returned pair is a ``TrOCRProcessor`` rather than
    a bare tokenizer: a text tokenizer cannot turn an image into
    ``pixel_values``, whereas the processor both prepares images (when called)
    and decodes generated ids (``batch_decode``), so it can be used wherever
    the previous ``tokenizer`` object was passed.

    ``st.cache_resource`` keeps a single loaded copy across Streamlit reruns,
    so the checkpoint is not reloaded on every widget interaction.

    Returns:
        tuple: ``(processor, model)`` for ``microsoft/trocr-large-printed``.
    """
    # Imported here so the fix does not depend on the file-level import line.
    from transformers import TrOCRProcessor

    checkpoint = "microsoft/trocr-large-printed"
    processor = TrOCRProcessor.from_pretrained(checkpoint)
    model = AutoModelForImageTextToText.from_pretrained(checkpoint)
    return processor, model
13
+
14
def extract_text_with_trocr(image, tokenizer, model):
    """Extract text from an image using TrOCR.

    Args:
        image: PIL-style image exposing ``mode`` and ``convert``.
        tokenizer: Preprocessor object. It is called with ``images=`` and
            ``return_tensors="pt"`` and must return an object carrying
            ``pixel_values``; its ``batch_decode`` turns generated ids back
            into strings.
        model: Object exposing ``generate(pixel_values)``.

    Returns:
        str: The first decoded sequence.
    """
    # Uploaded PNGs can be RGBA, palette or greyscale; the published TrOCR
    # examples always hand the preprocessor an RGB image, so normalise first.
    if image.mode != "RGB":
        image = image.convert("RGB")

    pixel_values = tokenizer(images=image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]
19
+
20
+ # Multilingual Translation Models
21
@st.cache_resource
def load_translation_models():
    """Build the two translation pipelines used by the app.

    ``st.cache_resource`` keeps the pipelines alive across Streamlit reruns;
    without it every widget interaction would re-create both models.

    Returns:
        tuple: ``(translator_en, translator_ur)`` where ``translator_en`` uses
        ``Helsinki-NLP/opus-mt-mul-en`` and ``translator_ur`` uses
        ``Helsinki-NLP/opus-mt-en-ur``.
    """
    translator_en = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en", framework="pt")
    translator_ur = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ur", framework="pt")
    return translator_en, translator_ur
 
 
 
 
 
25
 
26
# Instantiate every model at import time. The names bound here are module
# globals; extract_text_from_pdf_with_ocr reads `tokenizer` and `trocr_model`.
_translation_pipelines = load_translation_models()
translator_en, translator_ur = _translation_pipelines
_ocr_components = load_image_to_text_model()
tokenizer, trocr_model = _ocr_components
28
 
29
def extract_text_from_pdf_with_ocr(file_path):
    """Run TrOCR over every page of an image-based PDF and return the text.

    Pages are rendered with ``convert_from_path(file_path, 300)`` and each
    rendered page is passed to ``extract_text_with_trocr`` together with the
    module-level ``tokenizer`` and ``trocr_model``. Every page's text is
    followed by a newline.

    Any exception is reported through ``st.error``; whatever text was
    collected before the failure is still returned (possibly an empty string).
    """
    page_texts = []
    try:
        for rendered_page in convert_from_path(file_path, 300):
            recognised = extract_text_with_trocr(rendered_page, tokenizer, trocr_model)
            page_texts.append(recognised + "\n")
    except Exception as e:
        st.error(f"Error during OCR extraction: {e}")
    return "".join(page_texts)
39
 
40
+ # Translation Function
41
def translate_text(text, translator, max_chunk_size=512):
    """Translate text into the selected language.

    The text is cut into chunks of at most ``max_chunk_size`` characters,
    breaking only at whitespace so that no word is split across two chunks
    (a single word longer than the limit is still hard-split). Each chunk is
    translated separately and the results are joined with single spaces.

    Args:
        text: Source text; ``None`` or blank input yields ``""``.
        translator: Callable that takes a string and returns a sequence whose
            first item is a mapping containing ``'translation_text'``.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        str: The joined translation, or ``""`` if any chunk fails (the error
        is reported through ``st.error``).
    """
    import textwrap

    if not text or not text.strip():
        return ""

    # Whitespace-aware chunking: slicing at fixed offsets cuts words in half,
    # and the halves end up separated by a space once the chunks are joined.
    text_chunks = textwrap.wrap(
        text,
        width=max_chunk_size,
        break_long_words=True,
        break_on_hyphens=False,
    )

    translations = []
    for chunk in text_chunks:
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)
55
+
56
# Streamlit UI
st.title("📚 Image-Based Document Translator with TrOCR and Translation Models")
st.write("Translate image-based PDF or image files using advanced models.")

uploaded_file = st.file_uploader("Upload a PDF or Image file (JPG/PNG)", type=["pdf", "jpg", "png"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    # Local imports keep this block self-contained.
    import os
    import tempfile

    # The name comes from the client: keep only its final path component so
    # it cannot point outside the temporary directory.
    safe_name = os.path.basename(uploaded_file.name) or "upload"
    # Compare case-insensitively so "SCAN.PDF" is still routed to the PDF path.
    is_pdf = safe_name.lower().endswith(".pdf")

    # Streamlit keeps the upload in memory, while convert_from_path needs a
    # real file, so the bytes are written out before any path-based step.
    with tempfile.TemporaryDirectory() as tmp_dir:
        file_path = os.path.join(tmp_dir, safe_name)
        with open(file_path, "wb") as saved_upload:
            saved_upload.write(uploaded_file.getvalue())

        if is_pdf:
            # Image-based PDF processing using TrOCR
            text_content = extract_text_from_pdf_with_ocr(file_path)
        else:
            text_content = extract_text_with_trocr(Image.open(uploaded_file), tokenizer, trocr_model)

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")