"""from fastapi import FastAPI, UploadFile, Form from fastapi.responses import RedirectResponse, FileResponse, JSONResponse import os import shutil from PIL import Image from transformers import ViltProcessor, ViltForQuestionAnswering from gtts import gTTS import torch import tempfile import gradio as gr app = FastAPI() # Load VQA Model vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa") vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa") def answer_question_from_image(image, question): if image is None or not question.strip(): return "Please upload an image and ask a question.", None # Process with model inputs = vqa_processor(image, question, return_tensors="pt") with torch.no_grad(): outputs = vqa_model(**inputs) predicted_id = outputs.logits.argmax(-1).item() answer = vqa_model.config.id2label[predicted_id] # Generate TTS audio try: tts = gTTS(text=answer) with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp: tts.save(tmp.name) audio_path = tmp.name except Exception as e: return f"Answer: {answer}\n\n⚠️ Audio generation error: {e}", None return answer, audio_path def process_image_question(image: Image.Image, question: str): answer, audio_path = answer_question_from_image(image, question) return answer, audio_path gui = gr.Interface( fn=process_image_question, inputs=[ gr.Image(type="pil", label="Upload Image"), gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question") ], outputs=[ gr.Textbox(label="Answer", lines=5), gr.Audio(label="Answer (Audio)", type="filepath") ], title="🧠 Image QA with Voice", description="Upload an image and ask a question. You'll get a text + spoken answer." ) app = gr.mount_gradio_app(app, gui, path="/") @app.get("/") def home(): return RedirectResponse(url="/") """ from fastapi import FastAPI from fastapi.responses import RedirectResponse from PIL import Image from transformers import ( ViltProcessor, ViltForQuestionAnswering, T5Tokenizer, T5ForConditionalGeneration ) from gtts import gTTS import torch import tempfile import gradio as gr app = FastAPI() # VQA Model vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa") vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa") # Text Rewriter (FLAN-T5-base) rewrite_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base") rewrite_model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base") def rewrite_answer(question, short_answer): prompt = f"Answer the question '{question}' with a complete sentence using this answer: '{short_answer}'" inputs = rewrite_tokenizer(prompt, return_tensors="pt") with torch.no_grad(): outputs = rewrite_model.generate(**inputs, max_new_tokens=50) return rewrite_tokenizer.decode(outputs[0], skip_special_tokens=True) def answer_question_from_image(image, question): if image is None or not question.strip(): return "Please upload an image and ask a question.", None inputs = vqa_processor(image, question, return_tensors="pt") with torch.no_grad(): outputs = vqa_model(**inputs) predicted_id = outputs.logits.argmax(-1).item() short_answer = vqa_model.config.id2label[predicted_id] # Rewrite to full sentence full_answer = rewrite_answer(question, short_answer) try: tts = gTTS(text=full_answer) with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp: tts.save(tmp.name) return full_answer, tmp.name except Exception as e: return f"{full_answer}\n\n⚠️ Audio generation error: {e}", None def process_image_question(image: Image.Image, question: str): return answer_question_from_image(image, question) # Gradio UI gui = gr.Interface( fn=process_image_question, inputs=[ gr.Image(type="pil", label="Upload Image"), gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question") ], outputs=[ gr.Textbox(label="Answer", lines=5), gr.Audio(label="Answer (Audio)", type="filepath") ], title="🧠 Image QA with Voice", description="Upload an image and ask a question. You'll get a full-sentence spoken answer." ) app = gr.mount_gradio_app(app, gui, path="/") @app.get("/") def home(): return RedirectResponse(url="/")