import os
from PIL import Image
import gradio as gr
from transformers import pipeline

# 1) Initialise the Hugging Face pipelines once, at import time
# Captioning pipeline: used below to produce the raw caption for an image.
captioner = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)
# Zero-shot image classifier: used below to score an image against SCENE_LABELS.
scene_classifier = pipeline(
    task="zero-shot-image-classification",
    model="openai/clip-vit-base-patch32",
)

# 2) Scene labels and the response template associated with each label
# Candidate labels handed to the zero-shot scene classifier.
SCENE_LABELS = [
    "outdoor",
    "indoor",
    "beach",
    "office",
    "street",
    "restaurant",
    "park",
    "sports",
    "kitchen",
    "mountain",
]
# Every template shares the same opening sentence (with a ``{caption}``
# placeholder) followed by a scene-specific remark.
TEMPLATES = {
    label: "In this picture, {caption}. " + remark
    for label, remark in (
        ("outdoor", "It looks like a pleasant outdoor setting, and the subject seems relaxed."),
        ("indoor", "It appears to be indoors, perhaps at home or in an office environment."),
        ("beach", "It seems to be on a beach, and the atmosphere looks warm and sunny."),
        ("office", "It looks like an office scene, with people engaged in work or discussion."),
        ("street", "The scene appears to be on a busy street, with vehicles and pedestrians."),
        ("restaurant", "It looks like a restaurant setting, where people are dining together."),
        ("park", "The location seems to be a park, with trees and open space."),
        ("sports", "It appears to be a sports activity, showing movement and action."),
        ("kitchen", "It seems to be in a kitchen, with cooking utensils visible."),
        ("mountain", "The background looks like mountains, suggesting a hiking scene."),
    )
}

def generate_caption(image_path):
    """Build a TOEIC Part 1 style description for the image at *image_path*.

    The image is captioned with ``captioner``, its scene is predicted with
    ``scene_classifier`` over ``SCENE_LABELS``, and the caption is inserted
    into the matching entry of ``TEMPLATES`` (or a generic sentence when no
    template matches).

    Args:
        image_path: Filesystem path of the image to describe. ``None`` or an
            empty string (nothing uploaded) yields an error message.

    Returns:
        The formatted description, or a string starting with ``"🔴 Error:"``
        when no image was given or any processing step fails.
    """
    # Nothing was uploaded: report it plainly instead of leaking an internal
    # exception message from the image loader.
    if not image_path:
        return "🔴 Error: no image provided"

    try:
        # 1) Load the image. The context manager closes the underlying file
        #    once the RGB copy has been created.
        with Image.open(image_path) as source:
            img = source.convert("RGB")

        # 2) Generate the raw caption
        out = captioner(img)
        first = out[0] if isinstance(out, list) else out
        raw = first.get("generated_text") or first.get("text") or str(first)
        raw = raw.strip()

        # 3) Classify the scene. The zero-shot *image* classification pipeline
        #    returns a list of {"score": ..., "label": ...} dicts (not a dict
        #    with a "labels" key), so pick the best-scoring entry.
        predictions = scene_classifier(img, candidate_labels=SCENE_LABELS)
        if predictions:
            scene = max(predictions, key=lambda p: p["score"])["label"]
        else:
            scene = None

        # 4) Map the scene to its template and return the filled-in sentence
        template = TEMPLATES.get(scene, "In this picture, {caption}.")
        return template.format(caption=raw)

    except Exception as e:
        # UI boundary: surface any failure as text in the output box.
        return f"🔴 Error: {e}"

# 5) Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 📸 TOEIC Part 1: 상황별 사진 묘사 (Single Image)")

    # The image is passed to the callback as a file path (type="filepath").
    img_in = gr.Image(label="Upload an image", type="filepath")
    btn = gr.Button("Describe")
    output = gr.Textbox(lines=4, label="TOEIC Part 1 Response")

    # Clicking the button runs generate_caption on the uploaded image and
    # writes its return value into the textbox.
    btn.click(generate_caption, inputs=img_in, outputs=output)

# 6) Launch the app
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from the PORT environment
    # variable and falls back to 7860 when it is not set.
    demo.launch(
        server_port=int(os.getenv("PORT", 7860)),
        server_name="0.0.0.0",
    )