englissi committed on
Commit
dfd8ef3
·
verified ·
1 Parent(s): 16ac547

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -38
app.py CHANGED
@@ -1,48 +1,57 @@
import gradio as gr
from transformers import pipeline

# BLIP base checkpoint: turns an input image into a raw English caption.
_CAPTION_MODEL = "Salesforce/blip-image-captioning-base"
captioner = pipeline("image-to-text", model=_CAPTION_MODEL)
7
def generate_caption(image, style):
    """Caption *image* with BLIP, then format the result for the chosen exam *style*."""
    result = captioner(image)

    # Normalize the pipeline output to a plain string — the return shape
    # varies across transformers versions ([{"generated_text": ...}],
    # [{"text": ...}], or a bare list of strings).
    if isinstance(result, list) and result:
        head = result[0]
        if isinstance(head, dict):
            caption = head.get("generated_text") or head.get("text") or ""
        else:
            caption = str(head)
    else:
        caption = str(result)
    caption = caption.strip()

    # Map the requested test format onto the raw caption.
    if style == "TOEIC Speaking Part 1":
        return f"Q: What do you see in the picture?\nA: {caption.capitalize()}."
    if style == "IELTS Describe a Photo":
        return (
            "Describe the photo in two sentences:\n"
            f"1. {caption.capitalize()}.\n"
            "2. It also shows the context of daily life."
        )
    return caption
 
 
 
 
 
 
33
 
34
with gr.Blocks() as demo:
    # NOTE(review): Korean UI strings below were mojibake in the scrape;
    # restored to the intended UTF-8 text.
    gr.Markdown("## 📸 이미지 캡셔닝 → English Test 스타일 문장 생성")
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload Image")
        style_selector = gr.Dropdown(
            choices=["Raw Caption", "TOEIC Speaking Part 1", "IELTS Describe a Photo"],
            value="TOEIC Speaking Part 1",
            label="시험 형식 선택",
        )
    caption_output = gr.Textbox(label="Generated Caption", lines=4)
    generate_btn = gr.Button("Generate")
    generate_btn.click(
        fn=generate_caption,
        inputs=[image_input, style_selector],
        outputs=caption_output,
    )

if __name__ == "__main__":
    demo.launch()
 
import gradio as gr
from transformers import pipeline

# Model checkpoints for the two inference pipelines below.
_CAPTION_MODEL = "Salesforce/blip-image-captioning-base"
_SCENE_MODEL = "openai/clip-vit-base-patch32"

# BLIP: produces a raw English caption for an input image.
captioner = pipeline("image-to-text", model=_CAPTION_MODEL)

# CLIP zero-shot classifier: scores candidate scene labels for an image.
scene_classifier = pipeline("zero-shot-image-classification", model=_SCENE_MODEL)
9
 
10
# One sentence tail per CLIP scene label. The dict's insertion order also
# fixes the candidate-label order handed to the zero-shot classifier.
_SCENE_SENTENCES = {
    "outdoor": "It looks like a pleasant outdoor setting, and the subject seems relaxed.",
    "indoor": "It appears to be indoors, perhaps at home or in an office environment.",
    "beach": "It seems to be on a beach, and the atmosphere looks warm and sunny.",
    "office": "It looks like an office scene, with people engaged in work or discussion.",
    "street": "The scene appears to be on a busy street, with vehicles and pedestrians.",
    "restaurant": "It looks like a restaurant setting, where people are dining together.",
    "park": "The location seems to be a park, with trees and open space.",
    "sports": "It appears to be a sports activity, showing movement and action.",
    "kitchen": "It seems to be in a kitchen, with cooking utensils visible.",
    "mountain": "The background looks like mountains, suggesting a hiking scene.",
}

# Candidate labels for the zero-shot scene classifier.
SCENE_LABELS = list(_SCENE_SENTENCES)

# Full response templates: common lead-in + scene-specific tail.
TEMPLATES = {
    label: "In this picture, {caption}. " + tail
    for label, tail in _SCENE_SENTENCES.items()
}
24
+
25
def generate_caption(images, choice_index):
    """Caption the selected uploaded image and wrap it in a scene-specific template.

    Args:
        images: list of uploaded images from Gradio (file paths or PIL images;
            transformers pipelines accept either). May be None/empty if the
            user has not uploaded anything yet.
        choice_index: index of the image to describe, as a string or int
            (the UI dropdown supplies "0".."3").

    Returns:
        A "TOEIC Part 1"-style description string, or a user-facing error
        message when no image (or no image at that index) is available.
    """
    # Guard: the dropdown always offers 0-3, independent of how many images
    # were actually uploaded — fail gracefully instead of raising.
    if not images:
        return "Please upload at least one image first."
    idx = int(choice_index)
    if not 0 <= idx < len(images):
        return f"Image index {idx} is out of range; {len(images)} image(s) uploaded."
    img = images[idx]

    # 1) Raw caption. The pipeline's return shape varies across transformers
    #    versions, so normalize defensively (the previous revision of this
    #    function did; calling .get() on a non-dict would raise).
    out = captioner(img)
    first = out[0] if isinstance(out, list) and out else out
    if isinstance(first, dict):
        raw = first.get("generated_text") or first.get("text") or ""
    else:
        raw = str(first)
    raw = raw.strip()

    # 2) Zero-shot scene classification; labels come back ranked by score,
    #    so the first label is the best match.
    cls = scene_classifier(img, candidate_labels=SCENE_LABELS)
    scene = cls["labels"][0]

    # 3) Fill the scene template, with a neutral fallback for unmapped labels.
    template = TEMPLATES.get(scene, "In this picture, {caption}.")
    return template.format(caption=raw)
42
 
43
with gr.Blocks() as demo:
    # NOTE(review): Korean UI strings below were mojibake in the scrape;
    # restored to the intended UTF-8 text.
    gr.Markdown("## 📸 TOEIC Part 1: 상황별 사진 묘사")
    # BUG FIX: gr.Files only accepts type="filepath" or "binary" — the
    # original type="pil" is rejected by Gradio at build time. File paths
    # work fine because transformers pipelines accept paths directly.
    img_inputs = gr.Files(
        file_count="multiple",
        type="filepath",
        label="Upload up to 4 images",
    )
    choice = gr.Dropdown(
        choices=[str(i) for i in range(4)],
        value="0",
        label="Which image to describe? (0–3)",
    )
    btn = gr.Button("Describe")
    output = gr.Textbox(label="TOEIC Part 1 Response", lines=4)
    btn.click(fn=generate_caption, inputs=[img_inputs, choice], outputs=output)

if __name__ == "__main__":
    demo.launch()