"""FastAPI application that serves a Gradio UI for visual question answering.

A user uploads an image and types a question. The ViLT VQA checkpoint
predicts a short answer, and gTTS renders that answer to an MP3 file so it
can also be played back as audio.
"""

import os
import shutil
import tempfile
from typing import Optional, Tuple

import gradio as gr
import torch
from fastapi import FastAPI, UploadFile, Form
from fastapi.responses import RedirectResponse, FileResponse, JSONResponse
from gtts import gTTS
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

app = FastAPI()

# Hugging Face checkpoint shared by the processor and the model.
VQA_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"

# Load VQA model (done once, at import time).
vqa_processor = ViltProcessor.from_pretrained(VQA_CHECKPOINT)
vqa_model = ViltForQuestionAnswering.from_pretrained(VQA_CHECKPOINT)


def _synthesize_speech(text: str) -> str:
    """Render *text* to an MP3 file with gTTS and return the file's path.

    The file is created with ``delete=False`` and is deliberately left on
    disk on success, because the caller hands the path on to the audio
    output component. If synthesis fails, the file is removed and the
    exception is re-raised.
    """
    # Only the unique path is needed; close our own handle before gTTS
    # writes to that path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
        audio_path = tmp.name
    try:
        gTTS(text=text).save(audio_path)
    except Exception:
        # Do not leave an empty or partial file behind when synthesis fails.
        try:
            os.remove(audio_path)
        except OSError:
            pass
        raise
    return audio_path


def answer_question_from_image(image, question) -> Tuple[str, Optional[str]]:
    """Answer *question* about *image* and synthesize the answer as speech.

    Args:
        image: The image to query (the UI supplies a PIL image), or ``None``
            when nothing was uploaded.
        question: The question text; ``None``, empty and whitespace-only
            values are all treated as "no question".

    Returns:
        A ``(text, audio_path)`` tuple. ``audio_path`` is the path of the
        generated MP3 file, or ``None`` when the input was incomplete or
        audio generation failed; in those cases ``text`` carries the
        explanatory message instead of (or in addition to) the answer.
    """
    if image is None or not question or not question.strip():
        return "Please upload an image and ask a question.", None

    # The checkpoint works on 3-channel input, so normalise other PIL modes
    # (RGBA, L, P, ...) before handing the image to the processor.
    if isinstance(image, Image.Image) and image.mode != "RGB":
        image = image.convert("RGB")

    inputs = vqa_processor(image, question, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        outputs = vqa_model(**inputs)

    # The highest-scoring logit indexes the model's answer vocabulary.
    predicted_id = outputs.logits.argmax(-1).item()
    short_answer = vqa_model.config.id2label[predicted_id]

    try:
        audio_path = _synthesize_speech(short_answer)
    except Exception as e:
        # Best effort: still show the text answer when speech synthesis fails.
        return f"Answer: {short_answer}\n\n⚠️ Audio generation error: {e}", None

    return short_answer, audio_path


def process_image_question(image: Image.Image, question: str):
    """Gradio callback: return the ``(answer, audio_path)`` pair for the UI."""
    return answer_question_from_image(image, question)


gui = gr.Interface(
    fn=process_image_question,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(
            lines=2,
            placeholder="Ask a question about the image...",
            label="Question",
        ),
    ],
    outputs=[
        gr.Textbox(label="Answer", lines=5),
        gr.Audio(label="Answer (Audio)", type="filepath"),
    ],
    title="🧠 Image QA with Voice",
    description="Upload an image and ask a question. You'll get an answer spoken out loud.",
)

# Path at which the Gradio UI is mounted; ``home`` redirects to the same place.
GRADIO_MOUNT_PATH = "/"

app = gr.mount_gradio_app(app, gui, path=GRADIO_MOUNT_PATH)


@app.get("/")
def home():
    """Redirect the site root to the Gradio UI.

    NOTE(review): with ``GRADIO_MOUNT_PATH == "/"`` the mount registered
    above already captures "/", so this route appears to be unreachable, and
    a redirect from "/" to "/" would loop if it ever were reached. It only
    becomes useful if the UI is moved to a sub-path such as "/gradio".
    """
    return RedirectResponse(url=GRADIO_MOUNT_PATH)