tahirsher committed on
Commit
40548f3
·
verified ·
1 Parent(s): 3eaf646

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -7
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import fitz # PyMuPDF for PDF processing
2
  from PIL import Image
 
3
  from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration
4
  import streamlit as st
5
  import os
@@ -7,33 +8,50 @@ import re
7
  from docx import Document
8
  from langdetect import detect
9
 
10
- # Load BLIP-2 model and processor for image-to-text
11
  processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
12
  model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
13
 
14
- # Load translation model
15
  translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
16
 
 
 
 
17
 
18
  def extract_text_from_image(image):
19
- """Extract text from image using BLIP-2."""
20
- # Convert the image to RGB and preprocess
21
  image = image.convert("RGB")
22
  inputs = processor(images=image, return_tensors="pt")
23
-
24
- # Generate text from the image
25
  generated_ids = model.generate(**inputs)
26
  decoded_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
27
 
 
 
 
 
28
  return decoded_text.strip()
29
 
30
 
31
  def extract_from_pdf(pdf_path):
 
32
  doc = fitz.open(pdf_path)
33
  full_text = ""
 
34
  for page_num in range(len(doc)):
35
  page = doc.load_page(page_num)
36
- full_text += page.get_text() + "\n"
 
 
 
 
 
 
 
 
 
 
37
  return full_text.strip()
38
 
39
 
 
1
  import fitz # PyMuPDF for PDF processing
2
  from PIL import Image
3
+ import pytesseract
4
  from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration
5
  import streamlit as st
6
  import os
 
8
  from docx import Document
9
  from langdetect import detect
10
 
11
# Initialize the BLIP-2 captioning model and its processor (used as a
# fallback when OCR finds no text in an image).
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

# Multilingual -> English translation pipeline.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")

# Absolute path to the Tesseract binary used by pytesseract for OCR.
# NOTE(review): hard-coded for a Linux host (e.g. a container deployment);
# confirm this path on other targets.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
20
+
21
 
22
def extract_text_from_image(image):
    """Extract the textual content of an image.

    Tesseract OCR is tried first, since it transcribes the literal
    characters on the page. BLIP-2 is used only as a fallback when OCR
    finds nothing: its unconditional ``generate`` produces a *caption*
    describing the image, not a transcription, so putting it first (as
    the previous version did) meant the OCR branch was effectively dead
    code and callers received captions instead of the image's text.

    Args:
        image: A PIL.Image.Image instance.

    Returns:
        str: The extracted text, stripped of surrounding whitespace
        (may be empty if both OCR and BLIP-2 yield nothing).
    """
    # Both Tesseract and the BLIP-2 processor expect RGB; convert once
    # up front so the two paths share the same input.
    image = image.convert("RGB")

    # Primary path: real OCR of the characters in the image.
    extracted = pytesseract.image_to_string(image)

    if not extracted.strip():
        # Fallback: let BLIP-2 describe the image. NOTE(review): this is
        # a caption, not a transcription — kept only so callers never
        # silently receive an empty result.
        inputs = processor(images=image, return_tensors="pt")
        generated_ids = model.generate(**inputs)
        extracted = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    return extracted.strip()
35
 
36
 
37
def extract_from_pdf(pdf_path):
    """Extract text from a PDF, OCR-ing pages that lack a text layer.

    Args:
        pdf_path: Filesystem path to the PDF file.

    Returns:
        str: Concatenated text of all pages (one page per line),
        stripped of surrounding whitespace.
    """
    full_text = ""
    # Use the document as a context manager so the underlying file handle
    # is closed even if a page raises mid-loop (previously it leaked).
    with fitz.open(pdf_path) as doc:
        # fitz.Document iterates its pages directly — no index arithmetic.
        for page in doc:
            # Prefer the embedded text layer when the page has one.
            text = page.get_text()

            if not text.strip():
                # Likely a scanned page: rasterize it and run it through
                # the image text-extraction path.
                pix = page.get_pixmap()
                # NOTE(review): assumes the pixmap carries no alpha
                # channel; pass alpha=False to get_pixmap explicitly if
                # transparency ever appears in the inputs.
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                text = extract_text_from_image(image)

            full_text += text + "\n"

    return full_text.strip()
56
 
57