"""from fastapi import FastAPI, UploadFile, Form from fastapi.responses import RedirectResponse, FileResponse, JSONResponse import os import shutil from PIL import Image from transformers import ViltProcessor, ViltForQuestionAnswering from gtts import gTTS import torch import tempfile import gradio as gr app = FastAPI() # Load VQA Model vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa") vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa") def answer_question_from_image(image, question): if image is None or not question.strip(): return "Please upload an image and ask a question.", None # Process with model inputs = vqa_processor(image, question, return_tensors="pt") with torch.no_grad(): outputs = vqa_model(**inputs) predicted_id = outputs.logits.argmax(-1).item() answer = vqa_model.config.id2label[predicted_id] # Generate TTS audio try: tts = gTTS(text=answer) with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp: tts.save(tmp.name) audio_path = tmp.name except Exception as e: return f"Answer: {answer}\n\n⚠️ Audio generation error: {e}", None return answer, audio_path def process_image_question(image: Image.Image, question: str): answer, audio_path = answer_question_from_image(image, question) return answer, audio_path gui = gr.Interface( fn=process_image_question, inputs=[ gr.Image(type="pil", label="Upload Image"), gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question") ], outputs=[ gr.Textbox(label="Answer", lines=5), gr.Audio(label="Answer (Audio)", type="filepath") ], title="🧠 Image QA with Voice", description="Upload an image and ask a question. You'll get a text + spoken answer." ) app = gr.mount_gradio_app(app, gui, path="/") @app.get("/") def home(): return RedirectResponse(url="/") """ from fastapi import FastAPI from fastapi.responses import RedirectResponse import tempfile from PIL import Image import torch from transformers import ViltProcessor, ViltForQuestionAnswering, AutoTokenizer, AutoModelForCausalLM from gtts import gTTS import gradio as gr from transformers import AutoModelForSeq2SeqLM app = FastAPI() # Load VQA Model vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa") vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa") # Load GPT model to rewrite answers gpt_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small") gpt_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small") def rewrite_answer(question, short_answer): prompt = ( f"Question: {question}\n" f"Short Answer: {short_answer}\n" f"Now write a full sentence answering the question:" ) inputs = gpt_tokenizer(prompt, return_tensors="pt") with torch.no_grad(): outputs = gpt_model.generate( **inputs, max_new_tokens=50, do_sample=True, top_p=0.9, temperature=0.7, pad_token_id=gpt_tokenizer.eos_token_id ) generated = gpt_tokenizer.decode(outputs[0], skip_special_tokens=True) if "Now write a full sentence answering the question:" in generated: rewritten = generated.split("Now write a full sentence answering the question:")[-1].strip() else: rewritten = generated.strip() return rewritten def answer_question_from_image(image, question): if image is None or not question.strip(): return "Please upload an image and ask a question.", None inputs = vqa_processor(image, question, return_tensors="pt") with torch.no_grad(): outputs = vqa_model(**inputs) predicted_id = outputs.logits.argmax(-1).item() short_answer = 
vqa_model.config.id2label[predicted_id] # Rewrite to human-like sentence full_answer = rewrite_answer(question, short_answer) # Convert to speech try: tts = gTTS(text=full_answer) with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp: tts.save(tmp.name) audio_path = tmp.name except Exception as e: return f"{full_answer}\n\n⚠️ Audio generation error: {e}", None return full_answer, audio_path def process_image_question(image: Image.Image, question: str): answer, audio_path = answer_question_from_image(image, question) return answer, audio_path # Gradio UI gui = gr.Interface( fn=process_image_question, inputs=[ gr.Image(type="pil", label="Upload Image"), gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question") ], outputs=[ gr.Textbox(label="Answer", lines=5), gr.Audio(label="Answer (Audio)", type="filepath") ], title="🧠 Image QA with Voice", description="Upload an image and ask a question. You'll get a human-like spoken answer." ) # Mount on FastAPI app = gr.mount_gradio_app(app, gui, path="/") @app.get("/") def home(): return RedirectResponse(url="/")
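
# Local development entry point. This is a sketch, not part of the original
# app: it assumes uvicorn is installed (`pip install uvicorn`) and that this
# file is saved as main.py (the module name is an assumption). Equivalently,
# run from the shell:
#     uvicorn main:app --host 0.0.0.0 --port 7860
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)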