from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import cv2
import numpy as np
from deepface import DeepFace
import gradio as gr

# ====== Model loading ======
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
# Haar cascade for frontal face detection (bundled with opencv-python)
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
# ====== Clothing info extraction function ======
def extract_clothing(text):
    colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
    patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
    items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
             'suit', 'sneakers', 'hat', 'scarf', 'uniform']
    found_colors = [c for c in colors if c in text.lower()]
    found_patterns = [p for p in patterns if p in text.lower()]
    found_items = [i for i in text.lower().split() if i in items]
    return found_colors, found_patterns, found_items
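
# Illustration of the keyword matching above (the caption is hypothetical, purely
# for illustration). Matching is lexical, so only words BLIP actually emits are
# picked up:
#   extract_clothing("a woman in a red striped dress on a sunny street")
#   -> (['red'], ['striped'], ['dress'])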
# ====== Final description generation function ======
def generate_15_sentences(caption, num_faces, age_summary, clothing_sentence):
    sentences = []
    sentences.append(f"The image presents the scene: {caption}.")
    sentences.append("The visual tone combines human presence with context-rich elements.")
    sentences.append(f"A total of {num_faces} people with visible faces were detected.")
    if age_summary:
        summary_list = [f"{v} {k}(s)" for k, v in age_summary.items()]
        sentences.append("The crowd includes " + ", ".join(summary_list) + ".")
    else:
        sentences.append("No specific age or gender details were identified.")
    sentences.append(clothing_sentence)
    sentences.append("Facial expressions range from neutral to slightly expressive, adding emotional context.")
    sentences.append("Some individuals appear to be interacting with the environment or each other.")
    sentences.append("Although specific facial shapes are not automatically classified here, a mix of face sizes and angles is present.")
    sentences.append("Hairstyles vary, including short hair, longer cuts, and tied-back styles depending on individual orientation.")
    sentences.append("The photo captures diversity not only in people but also in visual textures and tones.")
    sentences.append("Clothing styles vary, suggesting informal or casual settings rather than formal events.")
    sentences.append("The spatial arrangement of individuals indicates natural movement or candid posture.")
    sentences.append("Background elements such as buildings or trees provide additional narrative depth.")
    sentences.append("The lighting helps highlight human features and adds dimensionality to the scene.")
    sentences.append("Overall, the image blends appearance, age, fashion, and emotion into a coherent story.")
    return sentences
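
# Illustration of the age/gender summary formatting above (the counts are
# hypothetical; keys follow the "<gender> <age_group>" format built in the
# main analysis function below):
#   age_summary = {"Woman adult": 2, "Man child": 1}
#   -> "The crowd includes 2 Woman adult(s), 1 Man child(s)."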
# ====== Main analysis function ======
def analyze_uploaded_image(image_pil):
    image_pil = image_pil.convert("RGB")
    image_np = np.array(image_pil)

    # 1. Caption generation (BLIP)
    inputs = processor(image_pil, return_tensors="pt")
    out = model.generate(**inputs)
    caption = processor.decode(out[0], skip_special_tokens=True)

    # 2. Face detection (OpenCV)
    gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.1, 4)

    # 3. Age/gender analysis with DeepFace
    face_infos = []
    for (x, y, w, h) in faces:
        # DeepFace follows OpenCV's BGR convention for numpy inputs, so convert from RGB
        face_crop = cv2.cvtColor(image_np[y:y+h, x:x+w], cv2.COLOR_RGB2BGR)
        try:
            analysis = DeepFace.analyze(face_crop, actions=['age', 'gender'], enforce_detection=False)
            age = analysis[0]['age']
            # Recent DeepFace versions put the label under 'dominant_gender';
            # 'gender' holds per-class probabilities
            gender = analysis[0].get('dominant_gender', analysis[0].get('gender'))
            if age < 13:
                age_group = "child"
            elif age < 20:
                age_group = "teen"
            elif age < 60:
                age_group = "adult"
            else:
                age_group = "senior"
            face_infos.append({
                "age_group": age_group,
                "gender": gender,
            })
        except Exception:
            # Skip faces that DeepFace cannot analyze
            continue
    num_faces = len(face_infos)

    # 4. Age group summary
    age_summary = {}
    for face in face_infos:
        key = f"{face['gender']} {face['age_group']}"
        age_summary[key] = age_summary.get(key, 0) + 1

    # 5. Clothing info extraction
    colors, patterns, items = extract_clothing(caption)
    parts = []
    if colors:
        parts.append(f"colors such as {', '.join(colors)}")
    if patterns:
        parts.append(f"patterns like {', '.join(patterns)}")
    if items:
        parts.append(f"clothing items such as {', '.join(items)}")
    clothing_sentence = ("The clothing observed includes " + " with ".join(parts) + ".") if parts else "Clothing is present but not clearly distinguishable."

    # 6. Final description generation
    final_description = generate_15_sentences(caption, num_faces, age_summary, clothing_sentence)
    return "\n".join([f"{i+1}. {s}" for i, s in enumerate(final_description)])
# ====== Gradio interface setup ======
interface = gr.Interface(
    fn=analyze_uploaded_image,
    inputs=gr.Image(type="pil", label="Upload an image"),
    outputs=gr.Textbox(label="15-sentence image description"),
    title="🧠 Image Description Generator (BLIP + DeepFace)",
    description="Upload an image and it will be described in 15 sentences covering the number of people, gender, age group, clothing, atmosphere, and more."
)

# ====== Run the app ======
interface.launch()
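
# The imports above imply the following packages. A minimal requirements.txt
# sketch for the Space (package names follow directly from the imports; the
# headless OpenCV build and unpinned versions are assumptions):
#   transformers
#   torch
#   opencv-python-headless
#   numpy
#   deepface
#   gradio
#   Pillow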