ikraamkb commited on
Commit
b298682
·
verified ·
1 Parent(s): a48ca23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -22
app.py CHANGED
@@ -7,56 +7,45 @@ from transformers import pipeline
7
  from gtts import gTTS
8
  import tempfile
9
  import os
10
-
11
-
12
 
13
# FastAPI application that will host the Gradio UI (mounted at the bottom of the file).
app = FastAPI()

# OCR Reader (lazy import inside function to avoid ImportError on Spaces):
# the reader is created on first use in process_image_question, so a missing
# or slow easyocr import cannot prevent the app from starting.
ocr_reader = None

# Captioning and VQA pipelines, loaded once at startup.
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
 
21
 
22
def process_image_question(image: Image.Image, question: str):
    """Run OCR, captioning, and visual question answering on an uploaded image.

    Parameters
    ----------
    image : PIL.Image.Image or None
        Image uploaded through the Gradio UI; ``None`` when nothing was uploaded.
    question : str
        Free-form question to ask about the image.

    Returns
    -------
    tuple[str, str | None]
        A human-readable summary (caption, OCR text, and answer) and the path
        of an MP3 file speaking the answer, or ``(error message, None)`` when
        processing fails.
    """
    if image is None:
        return "No image uploaded.", None

    try:
        # Import EasyOCR only when needed, and build the reader once.
        global ocr_reader
        if ocr_reader is None:
            import easyocr
            ocr_reader = easyocr.Reader(['en'], gpu=False)

        # EasyOCR expects a numpy array, not a PIL image.
        np_image = np.array(image)

        # OCR extraction; detail=0 returns plain strings only.
        ocr_texts = ocr_reader.readtext(np_image, detail=0)
        extracted_text = "\n".join(ocr_texts)

        # Generate caption.
        caption = caption_model(image)[0]['generated_text']

        # Ask the question about the image using VQA.
        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']

        # Combine results.
        final_output = f"🖼️ Caption: {caption}\n\n📖 OCR Text:\n{extracted_text}\n\n❓ Answer: {answer}"

        # Convert the answer to speech. Reserve a temp path and close the
        # handle *before* gTTS writes to it: saving into a still-open
        # NamedTemporaryFile fails on Windows.
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)

        return final_output, audio_path

    except Exception as e:
        # Surface the failure in the UI instead of crashing the request.
        return f"❌ Error processing image: {e}", None
60
 
61
  gui = gr.Interface(
62
  fn=process_image_question,
@@ -69,7 +58,7 @@ gui = gr.Interface(
69
  gr.Audio(label="Answer (Audio)", type="filepath")
70
  ],
71
  title="๐Ÿง  Image QA with Voice",
72
- description="Upload an image and ask any question. Supports OCR, captioning, visual QA, and audio response."
73
  )
74
 
75
# Serve the Gradio interface at the root path of the FastAPI app.
app = gr.mount_gradio_app(app, gui, path="/")
 
7
  from gtts import gTTS
8
  import tempfile
9
  import os
10
+ import easyocr
 
11
 
12
# FastAPI application that will host the Gradio UI (mounted at the bottom of the file).
app = FastAPI()

# Models: all three are loaded eagerly at startup so the first request is fast.
caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
ocr_reader = easyocr.Reader(['en'], gpu=False)  # English OCR, CPU-only
18
 
19
def process_image_question(image: Image.Image, question: str):
    """Run OCR, captioning, and visual question answering on an uploaded image.

    Parameters
    ----------
    image : PIL.Image.Image or None
        Image uploaded through the Gradio UI; ``None`` when nothing was uploaded.
    question : str
        Free-form question to ask about the image.

    Returns
    -------
    tuple[str, str | None]
        A human-readable summary (caption, OCR text, and answer) and the path
        of an MP3 file speaking the answer, or ``(error message, None)`` when
        processing fails.
    """
    if image is None:
        return "No image uploaded.", None

    try:
        # Convert image to numpy — EasyOCR expects an array, not a PIL image.
        np_image = np.array(image)

        # OCR text; detail=0 returns plain strings only.
        ocr_texts = ocr_reader.readtext(np_image, detail=0)
        extracted_text = "\n".join(ocr_texts)

        # Caption
        caption = caption_model(image)[0]['generated_text']

        # Visual QA
        vqa_result = vqa_model(image=image, question=question)
        answer = vqa_result[0]['answer']

        # Answer as speech. Reserve a temp path and close the handle *before*
        # gTTS writes to it: saving into a still-open NamedTemporaryFile
        # fails on Windows.
        tts = gTTS(text=answer)
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)

        final_output = f"🖼️ Caption: {caption}\n\n📖 OCR Text:\n{extracted_text}\n\n❓ Answer: {answer}"
        return final_output, audio_path

    except Exception as e:
        # Surface the failure in the UI instead of crashing the request.
        return f"❌ Error: {e}", None
49
 
50
  gui = gr.Interface(
51
  fn=process_image_question,
 
58
  gr.Audio(label="Answer (Audio)", type="filepath")
59
  ],
60
  title="๐Ÿง  Image QA with Voice",
61
+ description="Upload an image and ask any question โ€” even if there's no readable text. The app will use OCR, captioning, visual QA, and read answers aloud."
62
  )
63
 
64
# Serve the Gradio interface at the root path of the FastAPI app.
app = gr.mount_gradio_app(app, gui, path="/")