ikraamkb committed on
Commit
0d84ecf
·
verified ·
1 Parent(s): 8b98a2c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -88
app.py CHANGED
@@ -1,99 +1,97 @@
1
- from fastapi import FastAPI, UploadFile, Form
2
- from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
 
 
 
3
  import os
4
- import shutil
5
  from PIL import Image
6
- from transformers import ViltProcessor, ViltForQuestionAnswering, pipeline
7
- from gtts import gTTS
 
 
8
  import easyocr
9
- import torch
10
- import tempfile
11
- import gradio as gr
12
- import numpy as np
13
-
14
# FastAPI application object for this service.
app = FastAPI()

# Hugging Face checkpoints used below.
VQA_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"
CAPTION_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

# Visual question answering: processor and model come from the same checkpoint.
vqa_processor = ViltProcessor.from_pretrained(VQA_CHECKPOINT)
vqa_model = ViltForQuestionAnswering.from_pretrained(VQA_CHECKPOINT)

# Image captioning pipeline.
captioner = pipeline(task="image-to-text", model=CAPTION_CHECKPOINT)

# OCR reader configured for English and French.
reader = easyocr.Reader(lang_list=['en', 'fr'])
27
-
28
def classify_question(question: str):
    """Choose which backend should answer *question*.

    Args:
        question: The user's free-text question. ``None`` is treated like an
            empty string.

    Returns:
        ``"ocr"`` when the question asks about text in the image,
        ``"caption"`` when it asks for a description, otherwise ``"vqa"``.
    """
    import re  # local import: this module does not import ``re`` at file level

    question_lower = (question or "").lower()

    # Match whole words only. Plain substring tests misroute questions such as
    # "What kind of bread is this?" ("bread" contains "read") or "Is this an
    # essay?" ("essay" contains "say") to the OCR branch.
    ocr_pattern = r"\b(?:(?:text|say|read)(?:s|ing)?|(?:hand)?written)\b"
    if re.search(ocr_pattern, question_lower):
        return "ocr"

    caption_phrases = ("caption", "describe", "what is in the image")
    if any(phrase in question_lower for phrase in caption_phrases):
        return "caption"

    return "vqa"
36
-
37
def answer_question_from_image(image, question):
    """Answer a question about an image and synthesise the answer as speech.

    The question is routed by ``classify_question`` to OCR, captioning, or
    visual question answering. Errors from those backends are reported in the
    answer text rather than raised.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        question: The user's question; ``None`` or blank counts as missing.

    Returns:
        ``(answer, audio_path)``. ``audio_path`` is the path of an MP3 file
        holding the spoken answer, or ``None`` when input was missing or
        speech synthesis failed.
    """
    # ``question or ""`` keeps a None question from raising AttributeError.
    if image is None or not (question or "").strip():
        return "Please upload an image and ask a question.", None

    mode = classify_question(question)

    if mode == "ocr":
        try:
            result = reader.readtext(np.array(image))
            # The recognised string is the second element of each entry.
            text = " ".join(entry[1] for entry in result)
            answer = text.strip() or "No readable text found."
        except Exception as e:
            answer = f"OCR Error: {e}"

    elif mode == "caption":
        try:
            answer = captioner(image)[0]['generated_text']
        except Exception as e:
            answer = f"Captioning error: {e}"

    else:
        try:
            inputs = vqa_processor(image, question, return_tensors="pt")
            with torch.no_grad():
                outputs = vqa_model(**inputs)
            predicted_id = outputs.logits.argmax(-1).item()
            answer = vqa_model.config.id2label[predicted_id]
        except Exception as e:
            answer = f"VQA error: {e}"

    audio_path = None
    try:
        tts = gTTS(text=answer)
        # Reserve a file name, then close the handle before gTTS writes to it;
        # writing to a still-open NamedTemporaryFile fails on some platforms.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)
    except Exception as e:
        # Do not leave an empty or partial MP3 behind when synthesis fails.
        if audio_path is not None and os.path.exists(audio_path):
            os.unlink(audio_path)
        return f"Answer: {answer}\n\n⚠️ Audio generation error: {e}", None

    return answer, audio_path
76
-
77
def process_image_question(image: Image.Image, question: str):
    """Gradio callback: delegate to ``answer_question_from_image``.

    Returns the ``(answer, audio_path)`` pair produced by that function.
    """
    result = answer_question_from_image(image, question)
    text_answer, speech_file = result
    return text_answer, speech_file
80
-
81
# Gradio front end: one image and one question in, a text answer plus an
# audio rendering of it out.
gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question"),
    ],
    outputs=[
        gr.Textbox(label="Answer", lines=5),
        gr.Audio(label="Answer (Audio)", type="filepath"),
    ],
    title="🧐 Image QA with Voice",
    description="Upload an image and ask a question. Works for OCR, captioning, and VQA.",
)

# Serve the Gradio UI at the site root.
app = gr.mount_gradio_app(app, gui, path="/")

# NOTE: no separate `@app.get("/")` redirect is registered. The mount above
# already owns "/", so a route added afterwards would never be reached, and a
# redirect from "/" to "/" would loop forever if it were.
 
1
+ from fastapi import FastAPI, UploadFile, File, Form
2
+ from fastapi.responses import RedirectResponse
3
+ import gradio as gr
4
+ from transformers import pipeline
5
+ import tempfile
6
  import os
 
7
  from PIL import Image
8
+ import fitz # PyMuPDF
9
+ import docx
10
+ import openpyxl
11
+ from pptx import Presentation
12
  import easyocr
 
 
 
 
 
 
13
 
14
# Hugging Face checkpoints backing the two pipelines below.
SUMMARIZER_CHECKPOINT = "FeruzaBoynazarovaas/my_awesome_billsum_model"
CAPTIONER_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

# Text-to-text generation pipeline used for document summaries.
summarizer = pipeline(task="text2text-generation", model=SUMMARIZER_CHECKPOINT)

# Image captioning pipeline.
captioner = pipeline(task="image-to-text", model=CAPTIONER_CHECKPOINT)

# English-only OCR reader.
reader = easyocr.Reader(lang_list=['en'])

# FastAPI application object for this service.
app = FastAPI()
20
 
21
def extract_text_from_file(file_path: str, file_type: str):
    """Extract plain text from a PDF, DOCX, PPTX, or XLSX file.

    Args:
        file_path: Path of the document on disk.
        file_type: File extension identifying the format. Matching is
            case-insensitive and a leading dot is ignored, so ``"pdf"``,
            ``"PDF"`` and ``".pdf"`` are equivalent.

    Returns:
        The extracted text with items separated by newlines. Failures are
        reported in the return value rather than raised: the string
        ``"Unsupported file format"`` for an unknown type, or a string
        starting with ``"Error reading file:"`` when reading fails.
    """
    kind = (file_type or "").lower().lstrip(".")
    try:
        if kind == "pdf":
            with fitz.open(file_path) as doc:
                return "\n".join(page.get_text() for page in doc)

        if kind == "docx":
            document = docx.Document(file_path)
            return "\n".join(p.text for p in document.paragraphs)

        if kind == "pptx":
            prs = Presentation(file_path)
            return "\n".join(
                shape.text
                for slide in prs.slides
                for shape in slide.shapes
                if hasattr(shape, "text")  # not every shape carries text
            )

        if kind == "xlsx":
            # Read-only mode streams the sheets; the workbook must then be
            # closed explicitly to release the underlying file.
            wb = openpyxl.load_workbook(file_path, read_only=True)
            try:
                values = []
                for sheet in wb.worksheets:
                    for row in sheet.iter_rows(values_only=True):
                        # Skip empty cells so the text is not filled with the
                        # literal string "None".
                        values.extend(str(v) for v in row if v is not None)
                return "\n".join(values)
            finally:
                wb.close()

        return "Unsupported file format"
    except Exception as e:
        # Deliberately broad: any parser failure becomes an error string.
        return f"Error reading file: {str(e)}"
40
+
41
def process_document(file):
    """Summarise an uploaded document.

    Args:
        file: The upload. Either a filesystem path (``str`` or
            ``os.PathLike``) or a file-like object with a ``name`` attribute.
            ``None`` means nothing was uploaded.

    Returns:
        The generated summary, or a short message when no file was given, the
        format is unsupported, the file could not be read, or it contains no
        text.
    """
    if file is None:
        return "Please upload a document."

    is_path = isinstance(file, (str, os.PathLike))
    source_name = os.fspath(file) if is_path else file.name
    file_ext = os.path.splitext(source_name)[1][1:].lower()

    tmp_path = None
    try:
        if is_path or os.path.isfile(source_name):
            # The upload already lives on disk: read it in place.
            doc_path = source_name
        else:
            # Only a stream is available: spool it to a temporary file.
            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_ext}") as tmp:
                tmp.write(file.read())
                tmp_path = tmp.name
            doc_path = tmp_path

        text = extract_text_from_file(doc_path, file_ext)

        # The extractor reports failures as strings; pass them through instead
        # of asking the model to summarise an error message.
        if text == "Unsupported file format" or text.startswith("Error reading file:"):
            return text
        if not text.strip():
            return "No extractable text found in the document."

        # truncation=True keeps long documents within the model's input limit.
        result = summarizer(
            text,
            max_length=150,
            min_length=30,
            do_sample=False,
            truncation=True,
        )
        return result[0]['generated_text']
    finally:
        # Remove the spooled copy even when extraction or summarisation fails.
        if tmp_path is not None and os.path.exists(tmp_path):
            os.unlink(tmp_path)
56
+
57
def process_image(image):
    """Caption an image and extract any text it contains.

    Args:
        image: Filesystem path of the image (the UI's image input is
            configured with ``type="filepath"``), or ``None``/empty when
            nothing was uploaded.

    Returns:
        ``(caption, ocr_text)``, one value per output component wired to this
        callback. ``ocr_text`` is ``"No readable text found"`` when OCR finds
        nothing.
    """
    if not image:
        return "Please upload an image.", ""

    # Close the image handle as soon as the caption has been generated.
    with Image.open(image) as img:
        caption = captioner(img)[0]['generated_text']

    # Give EasyOCR the file path rather than the PIL object: readtext accepts
    # paths, bytes, and numpy arrays, not arbitrary PIL images.
    ocr_result = reader.readtext(image)
    # The recognised string is the second element of each result entry.
    ocr_text = " ".join(res[1] for res in ocr_result)

    # Return a tuple, not a string-keyed dict: Gradio assigns returned values
    # to the output components by position.
    return caption, (ocr_text if ocr_text else "No readable text found")
72
+
73
# Gradio interface: one tab for document summarisation, one for image analysis.
with gr.Blocks() as demo:
    gr.Markdown("# 📄 Document & Image Analysis Web Service")

    with gr.Tab("Document Summarization"):
        doc_input = gr.File(label="Upload Document (PDF, DOCX, PPTX, XLSX)")
        doc_output = gr.Textbox(label="Summary")
        doc_button = gr.Button("Summarize")

    with gr.Tab("Image Analysis"):
        img_input = gr.Image(type="filepath", label="Upload Image")
        with gr.Accordion("Results", open=True):
            caption_output = gr.Textbox(label="Image Caption")
            ocr_output = gr.Textbox(label="Extracted Text")
        img_button = gr.Button("Analyze")

    # Event wiring stays inside the Blocks context.
    doc_button.click(process_document, inputs=doc_input, outputs=doc_output)
    img_button.click(process_image, inputs=img_input, outputs=[caption_output, ocr_output])

# Mount the Gradio app at the site root.
app = gr.mount_gradio_app(app, demo, path="/")

# NOTE: no separate `@app.get("/")` redirect is registered. The mount above
# already owns "/", so a route added afterwards would never be reached, and a
# redirect from "/" to "/" would loop forever if it were.