"""Gradio demo: describe an uploaded photo in a TOEIC Part 1 style sentence.

A BLIP captioning pipeline produces a raw caption, a CLIP zero-shot image
classifier picks the most likely scene from ``SCENE_LABELS``, and the caption
is slotted into the matching sentence template from ``TEMPLATES``.
"""

import os

from PIL import Image
import gradio as gr
from transformers import pipeline

# 1) Initialise the pipelines once at import time so every request reuses them.
captioner = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
)
scene_classifier = pipeline(
    "zero-shot-image-classification",
    model="openai/clip-vit-base-patch32",
)

# 2) Scene labels and the sentence template associated with each of them.
SCENE_LABELS = [
    "outdoor", "indoor", "beach", "office", "street",
    "restaurant", "park", "sports", "kitchen", "mountain",
]

TEMPLATES = {
    "outdoor": "In this picture, {caption}. It looks like a pleasant outdoor setting, and the subject seems relaxed.",
    "indoor": "In this picture, {caption}. It appears to be indoors, perhaps at home or in an office environment.",
    "beach": "In this picture, {caption}. It seems to be on a beach, and the atmosphere looks warm and sunny.",
    "office": "In this picture, {caption}. It looks like an office scene, with people engaged in work or discussion.",
    "street": "In this picture, {caption}. The scene appears to be on a busy street, with vehicles and pedestrians.",
    "restaurant": "In this picture, {caption}. It looks like a restaurant setting, where people are dining together.",
    "park": "In this picture, {caption}. The location seems to be a park, with trees and open space.",
    "sports": "In this picture, {caption}. It appears to be a sports activity, showing movement and action.",
    "kitchen": "In this picture, {caption}. It seems to be in a kitchen, with cooking utensils visible.",
    "mountain": "In this picture, {caption}. The background looks like mountains, suggesting a hiking scene.",
}

# Used when the classifier yields no label or one without a template.
DEFAULT_TEMPLATE = "In this picture, {caption}."


def _extract_caption_text(result):
    """Return the stripped caption string from a captioning pipeline result."""
    first = result[0] if isinstance(result, list) else result
    text = first.get("generated_text") or first.get("text") or str(first)
    return text.strip()


def _top_scene_label(result):
    """Return the highest-scoring label from a classifier result, or None.

    The zero-shot *image* classification pipeline returns a list of
    ``{"score": ..., "label": ...}`` dicts. A ``{"labels": [...]}`` mapping
    (the shape the text zero-shot pipeline uses) is accepted as well so the
    helper does not break if the classifier is swapped.
    """
    if isinstance(result, dict):
        labels = result.get("labels") or []
        return labels[0] if labels else None
    if not result:
        return None
    # Pick by score explicitly instead of relying on the list being sorted.
    best = max(result, key=lambda item: item.get("score", 0.0))
    return best.get("label")


def generate_caption(image_path):
    """Build a TOEIC Part 1 style description for the image at ``image_path``.

    Args:
        image_path: Filesystem path of the uploaded image, or ``None`` when
            nothing has been uploaded.

    Returns:
        The templated description, or a string starting with ``"🔴 Error:"``
        when no image was supplied or any processing step failed.
    """
    if not image_path:
        return "🔴 Error: please upload an image first."

    try:
        # 1) Load the image; the context manager releases the file handle,
        #    while ``convert`` returns an independent in-memory copy.
        with Image.open(image_path) as source:
            img = source.convert("RGB")

        # 2) Generate the raw caption.
        raw = _extract_caption_text(captioner(img))

        # 3) Classify the scene.
        scene = _top_scene_label(
            scene_classifier(img, candidate_labels=SCENE_LABELS)
        )

        # 4) Map the scene to its template and fill in the caption.
        template = TEMPLATES.get(scene, DEFAULT_TEMPLATE)
        return template.format(caption=raw)
    except Exception as e:
        # UI boundary: report the failure in the output box instead of
        # crashing the request.
        return f"🔴 Error: {e}"


# 5) Gradio interface definition.
with gr.Blocks() as demo:
    gr.Markdown("## 📸 TOEIC Part 1: 상황별 사진 묘사 (Single Image)")
    img_in = gr.Image(type="filepath", label="Upload an image")
    btn = gr.Button("Describe")
    output = gr.Textbox(label="TOEIC Part 1 Response", lines=4)
    btn.click(fn=generate_caption, inputs=img_in, outputs=output)

# 6) Launch the app.
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", 7860)),
    )