tahirsher committed on
Commit
7037128
·
verified ·
1 Parent(s): 7149616

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -75
app.py CHANGED
@@ -1,84 +1,64 @@
1
- import streamlit as st
2
- from transformers import pipeline, AutoTokenizer, AutoModelForImageTextToText
3
  import pytesseract
4
- from pdf2image import convert_from_path
5
  from PIL import Image
6
- import torch
7
-
8
- # Image-to-Text Model (TrOCR)
9
def load_image_to_text_model():
    """Load the TrOCR preprocessing object and model for image-to-text.

    Returns:
        A ``(tokenizer, model)`` tuple. The first element is the checkpoint's
        processor rather than a plain text tokenizer: callers invoke it on an
        image and read ``.pixel_values`` from the result, which a text-only
        tokenizer cannot provide. The processor still exposes
        ``batch_decode``, so existing callers keep working unchanged.
    """
    # Local import: the file-level import list only brings in AutoTokenizer.
    from transformers import AutoProcessor

    checkpoint = "microsoft/trocr-large-printed"
    tokenizer = AutoProcessor.from_pretrained(checkpoint)
    model = AutoModelForImageTextToText.from_pretrained(checkpoint)
    return tokenizer, model
13
-
14
def extract_text_with_trocr(image, tokenizer, model):
    """Extract text from an image using TrOCR.

    Args:
        image: Image object exposing ``convert(mode)`` (e.g. a PIL image).
        tokenizer: Callable preprocessing object; called with the image and
            ``return_tensors="pt"``, it must return something with a
            ``pixel_values`` attribute, and it must provide ``batch_decode``.
        model: Object whose ``generate(pixel_values)`` returns token ids.

    Returns:
        The first decoded string produced for the image.
    """
    # Uploaded PNGs are frequently RGBA or grayscale; normalise to three
    # channels before preprocessing so the encoder always sees RGB input.
    rgb_image = image.convert("RGB")
    pixel_values = tokenizer(rgb_image, return_tensors="pt").pixel_values
    generated_ids = model.generate(pixel_values)
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0]
19
-
20
- # Multilingual Translation Models
21
def load_translation_models():
    """Build the two translation pipelines used by the app.

    Returns:
        A ``(translator_en, translator_ur)`` tuple: the first pipeline uses
        the ``opus-mt-mul-en`` checkpoint, the second ``opus-mt-en-ur``.
    """
    model_ids = ("Helsinki-NLP/opus-mt-mul-en", "Helsinki-NLP/opus-mt-en-ur")
    to_english, to_urdu = [
        pipeline("translation", model=model_id, framework="pt")
        for model_id in model_ids
    ]
    return to_english, to_urdu
25
 
26
# Instantiate every model once when the script is executed; the functions
# below read these module-level names directly.
(translator_en, translator_ur) = load_translation_models()
(tokenizer, trocr_model) = load_image_to_text_model()
 
28
 
29
def extract_text_from_pdf_with_ocr(file_path):
    """Extract text from image-based PDF using TrOCR.

    Every page is rendered at 300 DPI and passed through the module-level
    TrOCR objects. Any failure is reported through ``st.error`` and whatever
    text was gathered before the failure is still returned.
    """
    page_texts = []
    try:
        rendered_pages = convert_from_path(file_path, 300)
        for rendered_page in rendered_pages:
            recognised = extract_text_with_trocr(rendered_page, tokenizer, trocr_model)
            page_texts.append(recognised + "\n")
    except Exception as e:
        st.error(f"Error during OCR extraction: {e}")
    return "".join(page_texts)
39
 
40
- # Translation Function
41
def translate_text(text, translator, max_chunk_size=512):
    """Translate text into the selected language.

    The text is split into chunks of at most ``max_chunk_size`` characters.
    Splitting happens on whitespace so that words are not cut in half at a
    chunk boundary (a word longer than the limit is still split). Runs of
    whitespace, including newlines, are collapsed to single spaces.

    Args:
        text: Source text to translate.
        translator: Callable taking one chunk and returning a sequence whose
            first item is a mapping with a ``'translation_text'`` key.
        max_chunk_size: Maximum number of characters per chunk; must be
            positive.

    Returns:
        The chunk translations joined by single spaces, or ``""`` when the
        text contains nothing to translate or when any chunk fails (the
        failure is reported through ``st.error``).
    """
    import textwrap

    text_chunks = textwrap.wrap(text, width=max_chunk_size, break_on_hyphens=False)
    translations = []

    for chunk in text_chunks:
        try:
            result = translator(chunk)
            translations.append(result[0]['translation_text'])
        except Exception as e:
            st.error(f"Error during translation: {e}")
            return ""
    return " ".join(translations)
 
55
 
56
# Streamlit UI
# Script flow: upload a PDF or image, OCR it, preview the text, then translate
# it on demand into the language chosen with the radio button.
import os
import tempfile

st.title("📚 Image-Based Document Translator with TrOCR and Translation Models")
st.write("Translate image-based PDF or image files using advanced models.")

uploaded_file = st.file_uploader("Upload a PDF or Image file (JPG/PNG)", type=["pdf", "jpg", "png"])
target_language = st.radio("Select target language for translation", ["English", "Urdu"])

if uploaded_file:
    if uploaded_file.name.lower().endswith(".pdf"):
        # The PDF helper takes a path, so the uploaded bytes must be written
        # to disk first; a unique temporary file avoids clashes between
        # sessions and is removed even when extraction fails.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            tmp_file.write(uploaded_file.getbuffer())
            file_path = tmp_file.name
        try:
            text_content = extract_text_from_pdf_with_ocr(file_path)
        finally:
            os.remove(file_path)
    else:
        text_content = extract_text_with_trocr(Image.open(uploaded_file), tokenizer, trocr_model)

    st.subheader("Extracted Text (Preview)")
    st.write(text_content[:500] if text_content else "No content found in the file.")

    if st.button("Translate"):
        if text_content:
            st.subheader(f"Translated Text ({target_language})")
            translators = {"English": translator_en, "Urdu": translator_ur}
            selected_translator = translators.get(target_language)
            if selected_translator:
                translated_text = translate_text(text_content, selected_translator)
                # Only render the output box when a translation was actually
                # produced; otherwise there is no value to show.
                st.text_area("Translation Output", translated_text, height=300)
            else:
                st.warning("Translation model not loaded successfully.")
        else:
            st.warning("No text found to translate. Please upload a valid document.")
 
 
 
 
 
 
 
 
 
1
+ import fitz # PyMuPDF
 
2
  import pytesseract
 
3
  from PIL import Image
4
+ from transformers import pipeline
5
+ import streamlit as st
6
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
# Translation pipelines, created once when the script runs and shared by
# translate_text() below.
# NOTE(review): confirm that the "Helsinki-NLP/opus-mt-mul-ur" checkpoint is
# actually published on the Hub; pipeline() raises at startup if it is not.
translator_to_english = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-mul-en",
)
translator_to_urdu = pipeline(
    "translation",
    model="Helsinki-NLP/opus-mt-mul-ur",
)
11
 
12
+ # Function to extract text from an image using OCR
13
def extract_text_from_image(image):
    """Return the text Tesseract recognises in ``image``.

    OCR is run with the ``eng+urd`` language setting.
    """
    return pytesseract.image_to_string(image, lang='eng+urd')
16
 
17
+ # Function to extract images and text from a PDF
18
def extract_from_pdf(pdf_path):
    """Collect OCR text from embedded images plus the text layer of a PDF.

    For each page, every embedded image is decoded and passed through
    ``extract_text_from_image``; the page's own text layer is appended after
    the image texts. Each piece is terminated with a newline.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The concatenated text of all pages (empty string for a PDF with no
        pages).
    """
    # Local import: the file-level imports do not include io, which is needed
    # to wrap the raw image bytes for PIL.
    import io

    pieces = []
    # The context manager closes the document even if OCR or decoding fails.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for img in page.get_images(full=True):
                xref = img[0]  # first entry of the image tuple is its xref
                image_bytes = doc.extract_image(xref)["image"]
                image = Image.open(io.BytesIO(image_bytes))
                pieces.append(extract_text_from_image(image))
            pieces.append(page.get_text())
    return "".join(piece + "\n" for piece in pieces)
33
 
34
+ # Function to translate text to English and Urdu
35
def translate_text(text):
    """Translate ``text`` to English and Urdu.

    The text is split on whitespace into chunks of at most 400 characters and
    each chunk is translated separately, so long documents are not cut off by
    the ``max_length=400`` generation limit of a single pipeline call. Runs
    of whitespace (including newlines) are collapsed to single spaces.

    Args:
        text: Source text to translate.

    Returns:
        A ``(english_translation, urdu_translation)`` tuple of strings; both
        are empty when ``text`` contains nothing but whitespace.
    """
    import textwrap

    chunks = textwrap.wrap(text, width=400, break_on_hyphens=False)
    if not chunks:
        # Nothing to translate; skip calling the models with empty input.
        return "", ""

    def _translate_all(translator):
        # Translate chunk by chunk and stitch the pieces back together.
        return " ".join(
            translator(chunk, max_length=400)[0]['translation_text']
            for chunk in chunks
        )

    english_translation = _translate_all(translator_to_english)
    urdu_translation = _translate_all(translator_to_urdu)
    return english_translation, urdu_translation
 
39
 
40
# Streamlit UI
# Script flow: upload a PDF, extract its text (text layer plus OCR of embedded
# images), translate it, and show the English and Urdu results.
import os
import tempfile

st.title("PDF Document Translator")
uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")

if uploaded_file is not None:
    with st.spinner("Processing PDF..."):
        # Write the upload to a uniquely named temporary file instead of a
        # fixed "temp.pdf", which concurrent sessions would overwrite.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
            tmp_file.write(uploaded_file.getbuffer())
            temp_path = tmp_file.name

        try:
            # Extract text from the PDF
            extracted_text = extract_from_pdf(temp_path)
        finally:
            # Clean up the temporary file even when extraction fails
            os.remove(temp_path)

        if extracted_text.strip():
            # Translate the extracted text
            english_translation, urdu_translation = translate_text(extracted_text)
        else:
            english_translation = urdu_translation = None

    if english_translation is None:
        st.warning("No text could be extracted from the uploaded PDF.")
    else:
        # Display the translations
        st.subheader("English Translation")
        st.write(english_translation)

        st.subheader("Urdu Translation")
        st.write(urdu_translation)