ikraamkb committed on
Commit
f94fa3b
·
verified ·
1 Parent(s): 607327a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -36
app.py CHANGED
@@ -1,52 +1,48 @@
1
- from fastapi import FastAPI
2
- from fastapi.responses import RedirectResponse
3
- import gradio as gr
 
4
  from PIL import Image
5
- import numpy as np
6
- from transformers import pipeline
7
  from gtts import gTTS
 
8
  import tempfile
9
- import os
10
- import pytesseract # ✅ Replacing easyocr
11
-
12
-
13
 
app = FastAPI()

# Hugging Face pipelines, built once at module level so the request handler
# can reuse them instead of reloading the weights on every call.
caption_model = pipeline(
    task="image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
)
vqa_model = pipeline(
    task="visual-question-answering",
    model="dandelin/vilt-b32-finetuned-vqa",
)
19
-
20
 
def process_image_question(image: Image.Image, question: str):
    """Caption an image, OCR it, answer a question about it, and speak the answer.

    Args:
        image: The uploaded picture, or ``None`` when nothing was uploaded.
        question: The question to ask about the image.

    Returns:
        A ``(text, audio_path)`` tuple. On success ``text`` combines the
        caption, the OCR output and the VQA answer, and ``audio_path`` is the
        path of an MP3 file containing the spoken answer. On any failure
        ``text`` is the message to display and ``audio_path`` is ``None``.
    """
    if image is None:
        return "No image uploaded.", None
    if question is None or not question.strip():
        # Without a question there is nothing for the VQA model to answer.
        return "Please enter a question about the image.", None

    audio_path = None
    try:
        # OCR text using pytesseract
        extracted_text = pytesseract.image_to_string(image)

        # Caption
        caption = caption_model(image)[0]['generated_text']

        # Visual QA
        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']

        # Answer as speech. Only reserve the path inside the ``with`` block and
        # let gTTS write after the handle is closed: reopening a still-open
        # NamedTemporaryFile by name is not portable across platforms.
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)

        final_output = f"🖼️ Caption: {caption}\n\n📖 OCR Text:\n{extracted_text}\n\n❓ Answer: {answer}"
        return final_output, audio_path

    except Exception as e:
        # UI boundary: report the failure as text instead of raising, and do
        # not leave an orphaned temp file behind.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                pass
        return f"❌ Error: {e}", None
50
 
51
  gui = gr.Interface(
52
  fn=process_image_question,
@@ -55,15 +51,15 @@ gui = gr.Interface(
55
  gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question")
56
  ],
57
  outputs=[
58
- gr.Textbox(label="Result", lines=10),
59
  gr.Audio(label="Answer (Audio)", type="filepath")
60
  ],
61
  title="🧠 Image QA with Voice",
62
- description="Upload an image and ask any question — even if there's no readable text. The app will use OCR, captioning, visual QA, and read answers aloud."
63
  )
64
 
# Serve the Gradio interface from the site root.
app = gr.mount_gradio_app(app, gui, path="/")


@app.get("/")
def home():
    """Answer a GET on ``/`` with a redirect to ``/``."""
    # NOTE(review): the redirect target is this route's own path, and the
    # Gradio app is already mounted at "/" above. Confirm whether this handler
    # is ever reached and whether the redirect is still wanted.
    return RedirectResponse("/")
 
1
+ from fastapi import FastAPI, UploadFile, Form
2
+ from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
3
+ import os
4
+ import shutil
5
  from PIL import Image
6
+ from transformers import ViltProcessor, ViltForQuestionAnswering
 
7
  from gtts import gTTS
8
+ import torch
9
  import tempfile
10
+ import gradio as gr
 
 
 
11
 
app = FastAPI()

# Load VQA Model: the processor and the model come from the same checkpoint
# identifier so the two always match.
_VQA_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"
vqa_processor = ViltProcessor.from_pretrained(_VQA_CHECKPOINT)
vqa_model = ViltForQuestionAnswering.from_pretrained(_VQA_CHECKPOINT)
 
17
 
 
 
 
18
 
def answer_question_from_image(image, question):
    """Answer a question about an image and synthesise the answer as speech.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.
        question: The question text. ``None`` and blank strings are rejected.

    Returns:
        A ``(text, audio_path)`` tuple. ``text`` is the predicted answer, or a
        message explaining what went wrong. ``audio_path`` is the path of an
        MP3 file containing the spoken answer, or ``None`` when no audio was
        produced.
    """
    if image is None or question is None or not question.strip():
        return "Please upload an image and ask a question.", None

    # Normalise PIL images that are not already RGB (e.g. RGBA PNGs or
    # greyscale uploads) before handing them to the processor. Objects without
    # a ``mode`` attribute are passed through untouched.
    if getattr(image, "mode", "RGB") != "RGB":
        image = image.convert("RGB")

    # Process with model: the highest-scoring logit selects the answer label.
    inputs = vqa_processor(image, question, return_tensors="pt")
    with torch.no_grad():
        outputs = vqa_model(**inputs)
    predicted_id = outputs.logits.argmax(-1).item()
    answer = vqa_model.config.id2label[predicted_id]

    # Generate TTS audio. Only reserve the path inside the ``with`` block and
    # let gTTS write after the handle is closed: reopening a still-open
    # NamedTemporaryFile by name is not portable across platforms.
    audio_path = None
    try:
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)
    except Exception as e:
        # Still return the text answer, and do not leave an orphaned temp file.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                pass
        return f"Answer: {answer}\n\n⚠️ Audio generation error: {e}", None

    return answer, audio_path
40
+
41
+
def process_image_question(image: Image.Image, question: str):
    """Gradio callback that delegates to ``answer_question_from_image``.

    Returns the ``(answer, audio_path)`` pair produced by that helper,
    unchanged.
    """
    return answer_question_from_image(image, question)
45
 
 
 
46
 
47
  gui = gr.Interface(
48
  fn=process_image_question,
 
51
  gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question")
52
  ],
53
  outputs=[
54
+ gr.Textbox(label="Answer", lines=5),
55
  gr.Audio(label="Answer (Audio)", type="filepath")
56
  ],
57
  title="🧠 Image QA with Voice",
58
+ description="Upload an image and ask a question. You'll get a text + spoken answer."
59
  )
60
 
# Serve the Gradio interface from the site root.
app = gr.mount_gradio_app(app, gui, path="/")


@app.get("/")
def home():
    """Answer a GET on ``/`` with a redirect to ``/``."""
    # NOTE(review): the redirect target is this route's own path, and the
    # Gradio app is already mounted at "/" above. Confirm whether this handler
    # is ever reached and whether the redirect is still wanted.
    return RedirectResponse("/")