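# Image description demo: BLIP captioning + OpenCV face detection + DeepFace
# age/gender analysis, wrapped in a Gradio web UI.
# Dependencies (pip): pillow, transformers, torch, opencv-python, numpy, deepface, gradio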
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import cv2
import numpy as np
from deepface import DeepFace
import gradio as gr

# ====== Model loading ======
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
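# Haar cascade bundled with OpenCV, used for frontal face detection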
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")

# ====== Clothing information extraction ======
def extract_clothing(text):
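    """Scan a BLIP caption for known color, pattern, and clothing-item keywords."""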
    colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
    patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
    items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
             'suit', 'sneakers', 'hat', 'scarf', 'uniform']
    
    found_colors = [c for c in colors if c in text.lower()]
    found_patterns = [p for p in patterns if p in text.lower()]
    found_items = [i for i in text.lower().split() if i in items]
    
    return found_colors, found_patterns, found_items

# ====== Final description generation ======
def generate_15_sentences(caption, num_faces, age_summary, clothing_sentence):
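    """Assemble exactly 15 descriptive sentences from the caption, face count, age summary, and clothing info."""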
    sentences = []
    sentences.append(f"The image presents the scene: {caption}.")
    sentences.append("The visual tone combines human presence with context-rich elements.")
    sentences.append(f"A total of {num_faces} people with visible faces were detected.")
    
    if age_summary:
        summary_list = [f"{v} {k}(s)" for k, v in age_summary.items()]
        sentences.append("The crowd includes " + ", ".join(summary_list) + ".")
    else:
        sentences.append("No specific age or gender details were identified.")
    
    sentences.append(clothing_sentence)
    sentences.append("Facial expressions range from neutral to slightly expressive, adding emotional context.")
    sentences.append("Some individuals appear to be interacting with the environment or each other.")
    sentences.append("Although specific facial shapes are not automatically classified here, a mix of face sizes and angles is present.")
    sentences.append("Hairstyles vary, including short hair, longer cuts, and tied-back styles depending on individual orientation.")
    sentences.append("The photo captures diversity not only in people but also in visual textures and tones.")
    sentences.append("Clothing styles vary, suggesting informal or casual settings rather than formal events.")
    sentences.append("The spatial arrangement of individuals indicates natural movement or candid posture.")
    sentences.append("Background elements such as buildings or trees provide additional narrative depth.")
    sentences.append("The lighting helps highlight human features and adds dimensionality to the scene.")
    sentences.append("Overall, the image blends appearance, age, fashion, and emotion into a coherent story.")
    return sentences

# ====== Main analysis function ======
def analyze_uploaded_image(image_pil):
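    """Run the full pipeline: caption the image, detect faces, estimate age/gender, and build the description."""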
    image_pil = image_pil.convert("RGB")
    image_np = np.array(image_pil)

    # 1. Generate caption (BLIP)
    inputs = processor(image_pil, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    # 2. Detect faces (OpenCV Haar cascade)
    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=4)

    # 3. Analyze age and gender with DeepFace
    face_infos = []
    for (x, y, w, h) in faces:
        face_crop = image_np[y:y+h, x:x+w]
        try:
            analysis = DeepFace.analyze(face_crop, actions=['age', 'gender'], enforce_detection=False)
            age = analysis[0]['age']
            # Recent DeepFace releases return the label under 'dominant_gender'
            # (and class probabilities under 'gender'); fall back for older versions.
            gender = analysis[0].get('dominant_gender', analysis[0]['gender'])
            if age < 13:
                age_group = "child"
            elif age < 20:
                age_group = "teen"
            elif age < 60:
                age_group = "adult"
            else:
                age_group = "senior"
            face_infos.append({
                "age_group": age_group,
                "gender": gender,
            })
        except Exception:
            # Skip faces that DeepFace cannot analyze
            continue

    num_faces = len(face_infos)

    # 4. Summarize detected age groups
    age_summary = {}
    for face in face_infos:
        key = f"{face['gender']} {face['age_group']}"
        age_summary[key] = age_summary.get(key, 0) + 1

    # 5. Extract clothing information from the caption
    colors, patterns, items = extract_clothing(caption)
    parts = []
    if colors:
        parts.append(f"colors such as {', '.join(colors)}")
    if patterns:
        parts.append(f"patterns like {', '.join(patterns)}")
    if items:
        parts.append(f"clothing items such as {', '.join(items)}")
    if parts:
        clothing_sentence = "The clothing observed includes " + " with ".join(parts) + "."
    else:
        clothing_sentence = "Clothing is present but not clearly distinguishable."

    # 6. Generate the final description
    final_description = generate_15_sentences(caption, num_faces, age_summary, clothing_sentence)
    return "\n".join([f"{i+1}. {s}" for i, s in enumerate(final_description)])

# ====== Gradio interface setup ======
interface = gr.Interface(
    fn=analyze_uploaded_image,
    inputs=gr.Image(type="pil", label="Upload an image"),
    outputs=gr.Textbox(label="15-sentence image description"),
    title="🧠 Image Recognition Describer (BLIP + DeepFace)",
    description="Upload an image and it will describe the number of people, gender, age groups, clothing, mood, and more in 15 sentences."
)

# ====== Launch the app ======
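# launch() starts a local web server; pass share=True for a temporary public URL.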
interface.launch()