Spaces:

ikraamkb
/

Summarization

Sleeping

App Files Community

ikraamkb committed on Apr 9

Commit

b298682

verified ·

1 Parent(s): a48ca23

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -22

app.py CHANGED Viewed

@@ -7,56 +7,45 @@ from transformers import pipeline
 from gtts import gTTS
 import tempfile
 import os
 app = FastAPI()
-# OCR Reader (lazy import inside function to avoid ImportError on Spaces)
-ocr_reader = None
-# Captioning and VQA Pipelines
 caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
 vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
 def process_image_question(image: Image.Image, question: str):
     if image is None:
         return "No image uploaded.", None
     try:
-        # Import EasyOCR only when needed
-        global ocr_reader
-        if ocr_reader is None:
-            import easyocr
-            ocr_reader = easyocr.Reader(['en'], gpu=False)
-        # Convert PIL image to numpy array
         np_image = np.array(image)
-        # OCR extraction
         ocr_texts = ocr_reader.readtext(np_image, detail=0)
         extracted_text = "\n".join(ocr_texts)
-        # Generate caption
         caption = caption_model(image)[0]['generated_text']
-        # Ask question on image using VQA
         vqa_result = vqa_model(image=image, question=question)
         answer = vqa_result[0]['answer']
-        # Combine results
-        final_output = f"🖼️ Caption: {caption}\n\n📖 OCR Text:\n{extracted_text}\n\n❓ Answer: {answer}"
-        # Convert answer to speech
         tts = gTTS(text=answer)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
             tts.save(tmp.name)
             audio_path = tmp.name
         return final_output, audio_path
     except Exception as e:
-        return f"❌ Error processing image: {e}", None
 gui = gr.Interface(
     fn=process_image_question,
@@ -69,7 +58,7 @@ gui = gr.Interface(
         gr.Audio(label="Answer (Audio)", type="filepath")
     ],
     title="🧠 Image QA with Voice",
-    description="Upload an image and ask any question. Supports OCR, captioning, visual QA, and audio response."
 )
 app = gr.mount_gradio_app(app, gui, path="/")

 from gtts import gTTS
 import tempfile
 import os
+import easyocr
 app = FastAPI()
+# Models
 caption_model = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
 vqa_model = pipeline("visual-question-answering", model="dandelin/vilt-b32-finetuned-vqa")
+ocr_reader = easyocr.Reader(['en'], gpu=False)
 def process_image_question(image: Image.Image, question: str):
     if image is None:
         return "No image uploaded.", None
     try:
+        # Convert image to numpy
         np_image = np.array(image)
+        # OCR text
         ocr_texts = ocr_reader.readtext(np_image, detail=0)
         extracted_text = "\n".join(ocr_texts)
+        # Caption
         caption = caption_model(image)[0]['generated_text']
+        # Visual QA
         vqa_result = vqa_model(image=image, question=question)
         answer = vqa_result[0]['answer']
+        # Answer as speech
         tts = gTTS(text=answer)
         with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
             tts.save(tmp.name)
             audio_path = tmp.name
+        final_output = f"🖼️ Caption: {caption}\n\n📖 OCR Text:\n{extracted_text}\n\n❓ Answer: {answer}"
         return final_output, audio_path
     except Exception as e:
+        return f"❌ Error: {e}", None
 gui = gr.Interface(
     fn=process_image_question,
         gr.Audio(label="Answer (Audio)", type="filepath")
     ],
     title="🧠 Image QA with Voice",
+    description="Upload an image and ask any question — even if there's no readable text. The app will use OCR, captioning, visual QA, and read answers aloud."
 )
 app = gr.mount_gradio_app(app, gui, path="/")