# imagetalking / app.py
# englissi's picture
# Update app.py
# 73e2c66 verified
import os
from PIL import Image
import gradio as gr
from transformers import pipeline
# 1) Initialize the pipelines (both are built once, when the module loads)
# Image captioning pipeline: produces the raw caption text for a picture.
captioner = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)
# Zero-shot image classification pipeline: scores a picture against
# caller-supplied candidate labels.
scene_classifier = pipeline(
    task="zero-shot-image-classification",
    model="openai/clip-vit-base-patch32",
)
# 2) Define the scene labels and their description templates
# Every template opens with the same caption sentence; only the follow-up
# sentence varies per scene, so just that part is spelled out here.
_SCENE_TAILS = {
    "outdoor": "It looks like a pleasant outdoor setting, and the subject seems relaxed.",
    "indoor": "It appears to be indoors, perhaps at home or in an office environment.",
    "beach": "It seems to be on a beach, and the atmosphere looks warm and sunny.",
    "office": "It looks like an office scene, with people engaged in work or discussion.",
    "street": "The scene appears to be on a busy street, with vehicles and pedestrians.",
    "restaurant": "It looks like a restaurant setting, where people are dining together.",
    "park": "The location seems to be a park, with trees and open space.",
    "sports": "It appears to be a sports activity, showing movement and action.",
    "kitchen": "It seems to be in a kitchen, with cooking utensils visible.",
    "mountain": "The background looks like mountains, suggesting a hiking scene.",
}

# Scene label -> template with a single ``{caption}`` placeholder.
TEMPLATES = {
    scene: "In this picture, {caption}. " + tail
    for scene, tail in _SCENE_TAILS.items()
}

# Candidate labels, in the same order as the template table.
SCENE_LABELS = list(TEMPLATES)
def generate_caption(image_path):
    """Describe an image as a scene-aware sentence built from a template.

    The image is captioned with ``captioner``, its scene is chosen from
    ``SCENE_LABELS`` by ``scene_classifier``, and the caption is inserted
    into the matching entry of ``TEMPLATES`` (a generic sentence is used
    when the scene has no template).

    Args:
        image_path: Filesystem path of the image to describe. A falsy value
            (for example ``None`` when nothing was uploaded) is reported as
            an error.

    Returns:
        The filled-in description, or an error string if any step fails.
        Exceptions are never propagated to the caller.
    """
    try:
        # Raised inside the try so it is reported in the same error format
        # as every other failure.
        if not image_path:
            raise ValueError("No image provided.")

        # 1) Load the image. The context manager releases the file handle;
        #    convert() returns a separate image that stays usable afterwards.
        with Image.open(image_path) as source:
            img = source.convert("RGB")

        # 2) Generate the raw caption (accept a list of results or a single one)
        out = captioner(img)
        first = out[0] if isinstance(out, list) else out
        raw = first.get("generated_text") or first.get("text") or str(first)
        raw = raw.strip()

        # 3) Classify the scene. The zero-shot image pipeline returns a list
        #    of {"label", "score"} dicts with the best match first; indexing
        #    it like a dict (cls["labels"]) raised TypeError on every call.
        #    The dict shape is still accepted in case a classifier uses it.
        cls = scene_classifier(img, candidate_labels=SCENE_LABELS)
        if isinstance(cls, dict):
            scene = cls["labels"][0]
        else:
            scene = cls[0]["label"]

        # 4) Map the scene to its template and return the filled-in text
        template = TEMPLATES.get(scene, "In this picture, {caption}.")
        return template.format(caption=raw)
    except Exception as e:
        # UI boundary: show the failure to the user instead of crashing.
        return f"๐Ÿ”ด Error: {e}"
# 5) Define the Gradio interface
with gr.Blocks() as demo:
    # Heading shown at the top of the page.
    gr.Markdown(value="## ๐Ÿ“ธ TOEIC Partโ€ฏ1: ์ƒํ™ฉ๋ณ„ ์‚ฌ์ง„ ๋ฌ˜์‚ฌ (Single Image)")
    # The image is requested as a file path, which is what generate_caption opens.
    img_in = gr.Image(label="Upload an image", type="filepath")
    btn = gr.Button(value="Describe")
    output = gr.Textbox(lines=4, label="TOEIC Partโ€ฏ1 Response")
    # Clicking the button runs generate_caption on the uploaded image and
    # writes the returned text into the textbox.
    btn.click(generate_caption, img_in, output)
# 6) Run the app
if __name__ == "__main__":
    # Use the port from the PORT environment variable, falling back to 7860.
    listen_port = int(os.getenv("PORT", 7860))
    # Listen on all network interfaces.
    demo.launch(server_name="0.0.0.0", server_port=listen_port)