# app.py
import gradio as gr
import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, pipeline
from gtts import gTTS
import tempfile

class AIDoctor:
    def __init__(self,
                 model_id="RedHatAI/Qwen2.5-VL-7B-Instruct-quantized.w8a8"):
        self.device = "cpu"
        print("⚙️ Using device:", self.device)
        self.proc = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.float32,
            trust_remote_code=True
        ).to(self.device)
        self.stt = pipeline("automatic-speech-recognition",
                            model="openai/whisper-tiny",
                            device=-1)

    def analyze(self, image, question):
        if image is None:
            return "Please upload a medical image."
        prompt = question.strip() or "Analyze this medical image for any abnormalities."
        inputs = self.proc(images=image, text=prompt, return_tensors="pt").to(self.device)
        outs = self.model.generate(**inputs,
                                   max_new_tokens=150,
                                   temperature=0.7)
        return self.proc.decode(outs[0], skip_special_tokens=True).strip()

    def tts(self, text):
        tts = gTTS(text=text, lang="en")
        path = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3").name
        tts.save(path)
        return path

    def respond(self, image, audio, text):
        q = text.strip()
        if audio:
            res = self.stt(audio)
            q_upd = res.get("text", "").strip() if isinstance(res, dict) else str(res)
            if q_upd:
                q = q_upd
        resp = self.analyze(image, q)
        voice = self.tts(resp)
        return resp, voice, q

doctor = AIDoctor()

with gr.Blocks(title="🏥 AI Doctor (Qwen 2.5‑VL‑7B INT8)") as demo:
    gr.Markdown("## AI Doctor with **Qwen 2.5‑VL‑7B Instruct (quantized)**, CPU‑ready")
    with gr.Row():
        img = gr.Image(label="Upload Medical Image", type="pil")
        aud = gr.Audio(label="Ask by Voice", type="filepath")
    txt = gr.Textbox(label="Ask by Text", lines=2)
    out_txt = gr.Textbox(label="AI Response", lines=10)
    out_aud = gr.Audio(label="AI Speaks", type="filepath")
    q_out = gr.Textbox(label="Processed Question", lines=1)

    btn = gr.Button("Ask Doctor")
    btn.click(fn=doctor.respond,
              inputs=[img, aud, txt],
              outputs=[out_txt, out_aud, q_out])
demo.launch()