Spaces:
Sleeping
Sleeping
| """from fastapi import FastAPI, UploadFile, Form | |
| from fastapi.responses import RedirectResponse, FileResponse, JSONResponse | |
| import os | |
| import shutil | |
| from PIL import Image | |
| from transformers import ViltProcessor, ViltForQuestionAnswering | |
| from gtts import gTTS | |
| import torch | |
| import tempfile | |
| import gradio as gr | |
| app = FastAPI() | |
| # Load VQA Model | |
| vqa_processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa") | |
| vqa_model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa") | |
| def answer_question_from_image(image, question): | |
| if image is None or not question.strip(): | |
| return "Please upload an image and ask a question.", None | |
| # Process with model | |
| inputs = vqa_processor(image, question, return_tensors="pt") | |
| with torch.no_grad(): | |
| outputs = vqa_model(**inputs) | |
| predicted_id = outputs.logits.argmax(-1).item() | |
| answer = vqa_model.config.id2label[predicted_id] | |
| # Generate TTS audio | |
| try: | |
| tts = gTTS(text=answer) | |
| with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp: | |
| tts.save(tmp.name) | |
| audio_path = tmp.name | |
| except Exception as e: | |
| return f"Answer: {answer}\n\n⚠️ Audio generation error: {e}", None | |
| return answer, audio_path | |
| def process_image_question(image: Image.Image, question: str): | |
| answer, audio_path = answer_question_from_image(image, question) | |
| return answer, audio_path | |
| gui = gr.Interface( | |
| fn=process_image_question, | |
| inputs=[ | |
| gr.Image(type="pil", label="Upload Image"), | |
| gr.Textbox(lines=2, placeholder="Ask a question about the image...", label="Question") | |
| ], | |
| outputs=[ | |
| gr.Textbox(label="Answer", lines=5), | |
| gr.Audio(label="Answer (Audio)", type="filepath") | |
| ], | |
| title="🧠 Image QA with Voice", | |
| description="Upload an image and ask a question. You'll get a text + spoken answer." | |
| ) | |
| app = gr.mount_gradio_app(app, gui, path="/") | |
| @app.get("/") | |
| def home(): | |
| return RedirectResponse(url="/") """ | |
| from fastapi import FastAPI | |
| from fastapi.responses import RedirectResponse | |
| from PIL import Image | |
| from transformers import ( | |
| ViltProcessor, ViltForQuestionAnswering, | |
| T5Tokenizer, T5ForConditionalGeneration | |
| ) | |
| from gtts import gTTS | |
| import torch | |
| import tempfile | |
| import gradio as gr | |
# Pretrained checkpoint names handed to the ``from_pretrained`` loaders below.
_VQA_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"
_REWRITE_CHECKPOINT = "google/flan-t5-base"

# FastAPI application that the Gradio interface is later mounted onto.
app = FastAPI()

# Visual question answering: ViLT processor (input preparation) and model.
vqa_processor = ViltProcessor.from_pretrained(_VQA_CHECKPOINT)
vqa_model = ViltForQuestionAnswering.from_pretrained(_VQA_CHECKPOINT)

# Text rewriter (FLAN-T5-base): turns the short VQA label into a sentence.
rewrite_tokenizer = T5Tokenizer.from_pretrained(_REWRITE_CHECKPOINT)
rewrite_model = T5ForConditionalGeneration.from_pretrained(_REWRITE_CHECKPOINT)
def rewrite_answer(question, short_answer):
    """Rephrase a short VQA answer as a complete sentence.

    Builds an instruction prompt from ``question`` and ``short_answer``,
    runs it through the module-level ``rewrite_model`` (tokenised with
    ``rewrite_tokenizer``) and returns the decoded text.

    Args:
        question: The question the user asked about the image.
        short_answer: The answer label produced by the VQA model.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed. Generation is capped at 50 new tokens.
    """
    instruction = (
        f"Answer the question '{question}' with a complete sentence "
        f"using this answer: '{short_answer}'"
    )
    encoded = rewrite_tokenizer(instruction, return_tensors="pt")

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        generated = rewrite_model.generate(**encoded, max_new_tokens=50)

    first_sequence = generated[0]
    return rewrite_tokenizer.decode(first_sequence, skip_special_tokens=True)
def answer_question_from_image(image, question):
    """Answer a question about an image and render the answer as speech.

    The question and image are passed through the module-level ViLT
    processor/model to obtain a short answer label, which is expanded to a
    full sentence by ``rewrite_answer`` and then synthesised to an MP3 file
    with gTTS.

    Args:
        image: The image to inspect (passed straight to ``vqa_processor``),
            or ``None`` when nothing was uploaded.
        question: The question text; ``None``, empty and whitespace-only
            values are all treated as "no question".

    Returns:
        A ``(text, audio_path)`` tuple. ``audio_path`` is the path of the
        generated ``.mp3`` file, or ``None`` when the input was incomplete
        or audio generation failed (in which case ``text`` carries the
        error message appended to the answer).
    """
    import os

    # ``not question`` guards against None before calling ``.strip()``.
    if image is None or not question or not question.strip():
        return "Please upload an image and ask a question.", None

    inputs = vqa_processor(image, question, return_tensors="pt")
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = vqa_model(**inputs)
    predicted_id = outputs.logits.argmax(-1).item()
    short_answer = vqa_model.config.id2label[predicted_id]

    # Rewrite to full sentence
    full_answer = rewrite_answer(question, short_answer)

    audio_path = None
    # Audio is best-effort: any failure is reported in the text output rather
    # than aborting the request, hence the deliberately broad ``except``.
    try:
        tts = gTTS(text=full_answer)
        # Only reserve a unique path here and close the handle right away;
        # reopening a still-open NamedTemporaryFile by name is not portable.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
            audio_path = tmp.name
        tts.save(audio_path)
    except Exception as e:
        # The file was created with delete=False, so remove it ourselves
        # instead of leaving an orphan behind on every failed synthesis.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except OSError:
                pass
        return f"{full_answer}\n\n⚠️ Audio generation error: {e}", None
    return full_answer, audio_path
def process_image_question(image: Image.Image, question: str):
    """Gradio callback: forward the inputs to ``answer_question_from_image``.

    Args:
        image: The uploaded image.
        question: The question typed by the user.

    Returns:
        Whatever ``answer_question_from_image`` returns for these inputs,
        unchanged.
    """
    result = answer_question_from_image(image, question)
    return result
# Gradio UI: an image and a free-text question in, answer text and audio out.
_input_components = [
    gr.Image(type="pil", label="Upload Image"),
    gr.Textbox(
        lines=2,
        placeholder="Ask a question about the image...",
        label="Question",
    ),
]
_output_components = [
    gr.Textbox(label="Answer", lines=5),
    gr.Audio(label="Answer (Audio)", type="filepath"),
]

gui = gr.Interface(
    fn=process_image_question,
    inputs=_input_components,
    outputs=_output_components,
    title="🧠 Image QA with Voice",
    description="Upload an image and ask a question. You'll get a full-sentence spoken answer.",
)

# Mount the Gradio interface on the FastAPI application at the root path.
app = gr.mount_gradio_app(app, gui, path="/")
def home():
    """Return a redirect response pointing at ``"/"``.

    NOTE(review): no route decorator is attached to this function in the
    visible part of the file, so nothing visible registers or calls it.
    The Gradio app is mounted at the same ``"/"`` path, so confirm whether
    this handler is still needed before wiring it up.
    """
    redirect = RedirectResponse(url="/")
    return redirect