Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import fitz # PyMuPDF for PDF processing
|
2 |
from PIL import Image
|
|
|
3 |
from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration
|
4 |
import streamlit as st
|
5 |
import os
|
@@ -7,33 +8,50 @@ import re
|
|
7 |
from docx import Document
|
8 |
from langdetect import detect
|
9 |
|
10 |
-
# Initialize BLIP-2 model and processor for image-to-text
|
11 |
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
|
12 |
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
|
13 |
|
14 |
-
# Initialize translation pipeline
|
15 |
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
|
16 |
|
|
|
|
|
|
|
17 |
|
18 |
def extract_text_from_image(image):
    """Return the text BLIP-2 generates for *image*.

    The image is converted to RGB, run through the module-level
    ``processor`` and ``model``, and the first decoded sequence is
    returned with surrounding whitespace removed.
    """
    rgb_image = image.convert("RGB")
    model_inputs = processor(images=rgb_image, return_tensors="pt")

    # Generate token ids for the image, then decode them back to text.
    output_ids = model.generate(**model_inputs)
    decoded_batch = processor.batch_decode(output_ids, skip_special_tokens=True)

    # A single image was submitted, so only the first entry is used.
    return decoded_batch[0].strip()
|
29 |
|
30 |
|
31 |
def extract_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*.

    Page texts are joined with newlines and the result is stripped of
    leading/trailing whitespace. An empty string is returned when no
    page yields any text.
    """
    # The context manager closes the document even if reading a page fails.
    with fitz.open(pdf_path) as doc:
        # Previously each page was loaded but its text was never collected,
        # so the function always returned an empty string.
        page_texts = [page.get_text() for page in doc]

    return "\n".join(page_texts).strip()
|
38 |
|
39 |
|
|
|
1 |
import fitz # PyMuPDF for PDF processing
|
2 |
from PIL import Image
|
3 |
+
import pytesseract
|
4 |
from transformers import pipeline, Blip2Processor, Blip2ForConditionalGeneration
|
5 |
import streamlit as st
|
6 |
import os
|
|
|
8 |
from docx import Document
|
9 |
from langdetect import detect
|
10 |
|
11 |
+
# Initialize BLIP-2 model and processor for image-to-text.
# Both are loaded from the same checkpoint when this module is imported.
# NOTE(review): if this script is re-executed on every Streamlit interaction,
# these loads repeat each time — confirm whether resource caching is needed.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

# Initialize translation pipeline.
# Presumably multilingual -> English, judging by the model name; verify.
translator = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")

# Path to Tesseract executable for OCR.
# NOTE(review): hard-coded location; presumably matches the deployment
# image — confirm before running on another platform.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
|
20 |
+
|
21 |
|
22 |
def extract_text_from_image(image):
    """Extract text from an image with BLIP-2, falling back to Tesseract OCR.

    The image is converted to RGB and passed through the module-level
    ``processor`` and ``model``. If the decoded BLIP-2 output is empty
    after stripping, ``pytesseract.image_to_string`` is run on the same
    RGB image instead. The returned string is always stripped.
    """
    rgb_image = image.convert("RGB")

    # First attempt: BLIP-2 generation on the preprocessed image.
    model_inputs = processor(images=rgb_image, return_tensors="pt")
    output_ids = model.generate(**model_inputs)
    blip_text = processor.batch_decode(output_ids, skip_special_tokens=True)[0]

    cleaned = blip_text.strip()
    if cleaned:
        return cleaned

    # BLIP-2 produced nothing usable: fall back to OCR.
    return pytesseract.image_to_string(rgb_image).strip()
|
35 |
|
36 |
|
37 |
def extract_from_pdf(pdf_path):
    """Extract text from a PDF, using image-based extraction for pages without text.

    For each page the embedded text is read directly. When a page has no
    text (only whitespace), the page is rendered to an image and passed
    to ``extract_text_from_image``. Page texts are joined with newlines
    and the combined result is stripped of surrounding whitespace.
    """
    page_texts = []

    # The context manager closes the document even if a page fails to
    # render or the image extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            # Try extracting text directly.
            text = page.get_text()

            # If the page has no text layer, fall back to the image path.
            if not text.strip():
                # Request an RGB pixmap without alpha explicitly so the
                # sample buffer matches the "RGB" mode decoded below.
                # NOTE(review): rendering uses the default resolution;
                # confirm it is sufficient for OCR quality.
                pix = page.get_pixmap(colorspace=fitz.csRGB, alpha=False)
                image = Image.frombytes(
                    "RGB", (pix.width, pix.height), pix.samples
                )
                text = extract_text_from_image(image)

            page_texts.append(text)

    return "\n".join(page_texts).strip()
|
56 |
|
57 |
|