Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -6,72 +6,47 @@ import numpy as np
 from deepface import DeepFace
 import gradio as gr
 
-#
+# Load BLIP model
 processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
 
-#
+# Clothing extractor
 def extract_clothing(text):
     colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
     patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
     items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
              'suit', 'sneakers', 'hat', 'scarf', 'uniform']
-
+
     found_colors = [c for c in colors if c in text.lower()]
-    found_patterns = [p for p in
-    found_items = [i for i in
-
+    found_patterns = [p for p in patterns if p in text.lower()]
+    found_items = [i for i in items if i in text.lower()]
+
     return found_colors, found_patterns, found_items
 
-#
-def
-    sentences = []
-    sentences.append(f"The image presents the scene: {caption}.")
-    sentences.append("The visual tone combines human presence with context-rich elements.")
-    sentences.append(f"A total of {num_faces} people with visible faces were detected.")
-
-    if age_summary:
-        summary_list = [f"{v} {k}(s)" for k, v in age_summary.items()]
-        sentences.append("The crowd includes " + ", ".join(summary_list) + ".")
-    else:
-        sentences.append("No specific age or gender details were identified.")
-
-    sentences.append(clothing_sentence)
-    sentences.append("Facial expressions range from neutral to slightly expressive, adding emotional context.")
-    sentences.append("Some individuals appear to be interacting with the environment or each other.")
-    sentences.append("Although specific facial shapes are not automatically classified here, a mix of face sizes and angles is present.")
-    sentences.append("Hairstyles vary, including short hair, longer cuts, and tied-back styles depending on individual orientation.")
-    sentences.append("The photo captures diversity not only in people but also in visual textures and tones.")
-    sentences.append("Clothing styles vary, suggesting informal or casual settings rather than formal events.")
-    sentences.append("The spatial arrangement of individuals indicates natural movement or candid posture.")
-    sentences.append("Background elements such as buildings or trees provide additional narrative depth.")
-    sentences.append("The lighting helps highlight human features and adds dimensionality to the scene.")
-    sentences.append("Overall, the image blends appearance, age, fashion, and emotion into a coherent story.")
-    return sentences
-
-# ====== Main analysis function ======
-def analyze_uploaded_image(image_pil):
+# Main function
+def analyze_image(image_pil):
     image_pil = image_pil.convert("RGB")
     image_np = np.array(image_pil)
 
-    #
+    # Caption generation
     inputs = processor(image_pil, return_tensors="pt")
     out = model.generate(**inputs)
     caption = processor.decode(out[0], skip_special_tokens=True)
 
-    #
+    # Face detection
+    face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
     gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
     faces = face_cascade.detectMultiScale(gray, 1.1, 4)
 
-    # 3. Age/gender analysis with DeepFace
     face_infos = []
     for (x, y, w, h) in faces:
         face_crop = image_np[y:y+h, x:x+w]
         try:
-            analysis = DeepFace.analyze(face_crop, actions=['age', 'gender'], enforce_detection=False)
+            analysis = DeepFace.analyze(face_crop, actions=['age', 'gender', 'emotion'], enforce_detection=False)
             age = analysis[0]['age']
             gender = analysis[0]['gender']
+            emotion = analysis[0]['dominant_emotion']
+
             if age < 13:
                 age_group = "child"
             elif age < 20:
@@ -80,44 +55,95 @@ def analyze_uploaded_image(image_pil):
                 age_group = "adult"
             else:
                 age_group = "senior"
+
             face_infos.append({
-                "
+                "age": age,
                 "gender": gender,
+                "age_group": age_group,
+                "emotion": emotion
             })
-        except:
+        except Exception:
             continue
 
+    # Summary stats
     num_faces = len(face_infos)
-
-    # 4. Age group summary
+    gender_counts = {"Man": 0, "Woman": 0}
     age_summary = {}
+    emotion_summary = {}
+
     for face in face_infos:
-
-
+        gender = face['gender']
+        age_group = face['age_group']
+        emotion = face['emotion']
 
-
+        gender_counts[gender] += 1
+        age_summary[age_group] = age_summary.get(age_group, 0) + 1
+        emotion_summary[emotion] = emotion_summary.get(emotion, 0) + 1
+
+    # Clothing info from caption
     colors, patterns, items = extract_clothing(caption)
-    [old lines 100-119 removed here; their content is not shown in the diff view]
+
+    # Generate 15 sentences
+    sentences = []
+    sentences.append(f"According to the BLIP model, the scene can be described as: \"{caption}\".")
+    sentences.append(f"The image contains {num_faces} visible face(s) detected by OpenCV.")
+
+    gender_desc = []
+    if gender_counts["Man"] > 0:
+        gender_desc.append(f"{gender_counts['Man']} male(s)")
+    if gender_counts["Woman"] > 0:
+        gender_desc.append(f"{gender_counts['Woman']} female(s)")
+    if gender_desc:
+        sentences.append("Gender distribution shows " + " and ".join(gender_desc) + ".")
+    else:
+        sentences.append("Gender analysis was inconclusive.")
+
+    if age_summary:
+        age_list = [f"{count} {group}(s)" for group, count in age_summary.items()]
+        sentences.append("Age groups represented include " + ", ".join(age_list) + ".")
+    else:
+        sentences.append("No conclusive age groupings found.")
+
+    if emotion_summary:
+        emo_list = [f"{count} showing {emo}" for emo, count in emotion_summary.items()]
+        sentences.append("Facial expressions include " + ", ".join(emo_list) + ".")
+    else:
+        sentences.append("Emotion detection yielded limited results.")
+
+    if colors or patterns or items:
+        cloth_parts = []
+        if colors:
+            cloth_parts.append(f"colors like {', '.join(colors)}")
+        if patterns:
+            cloth_parts.append(f"patterns such as {', '.join(patterns)}")
+        if items:
+            cloth_parts.append(f"items like {', '.join(items)}")
+        sentences.append("The clothing observed includes " + " and ".join(cloth_parts) + ".")
+    else:
+        sentences.append("Clothing details were not clearly identified.")
+
+    if num_faces > 0:
+        sentences.append("Faces are distributed naturally across the image.")
+        sentences.append("Differences in face size suggest variation in distance from the camera.")
+        sentences.append("Hairstyles appear diverse, from short to tied-back styles.")
+        sentences.append("Lighting emphasizes certain facial features and expressions.")
+        sentences.append("Some individuals face the camera while others look away.")
+        sentences.append("Mood diversity is reflected in the variety of facial expressions.")
+        sentences.append("The clothing style appears casual or semi-formal.")
+    else:
+        sentences.append("No visible faces were found to analyze further visual characteristics.")
+
+    sentences.append("Overall, the image integrates facial, emotional, and clothing features into a cohesive scene.")
+
+    return "\n".join([f"{i+1}. {s}" for i, s in enumerate(sentences)])
+
+# Gradio Interface
+demo = gr.Interface(
+    fn=analyze_image,
+    inputs=gr.Image(type="pil"),
+    outputs=gr.Textbox(label="15-Sentence Detailed Description"),
+    title="Image Analysis with BLIP + DeepFace",
+    description="Upload an image to get a detailed 15-sentence description of facial features, age, gender, clothing, and more."
 )
 
-
-interface.launch()
+demo.launch()
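
As a quick local check of extract_clothing before launching the Space, the snippet below runs the helper on a made-up BLIP-style caption. It is a minimal sketch that assumes app.py is on the import path; the caption string and the expected output shown in comments are illustrative only.

    from app import extract_clothing

    # Any free-form caption works; this one is invented for illustration.
    caption = "a man in a striped blue shirt and black jeans standing outside"
    colors, patterns, items = extract_clothing(caption)
    print(colors)    # ['blue', 'black']
    print(patterns)  # ['striped']
    print(items)     # ['shirt', 'jeans']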
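
One version-sensitive spot in the face loop above is analysis[0]['gender']. Recent deepface releases return a probability dict under 'gender' and the plain label under 'dominant_gender', while older releases returned the label string directly, so gender_counts[gender] += 1 can fail depending on which version the Space installs. A defensive reader in the same spirit could look like the sketch below; read_gender is a hypothetical helper, not part of this commit.

    def read_gender(entry: dict) -> str:
        """Normalize one DeepFace analyze() result entry to 'Man' or 'Woman'."""
        g = entry.get('dominant_gender', entry.get('gender'))
        if isinstance(g, dict):  # probability dict, e.g. {'Man': 97.2, 'Woman': 2.8}
            g = max(g, key=g.get)
        return g

With it, the loop would set gender = read_gender(analysis[0]) and the "Man"/"Woman" counters keep working under either return format.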
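
Since the Space is still flagged as Runtime error, the dependency file is worth checking alongside this code change. The imports above imply at least the following requirements.txt; this is an assumed sketch, not the file from the repo, and exact pins may differ. Note that deepface pulls in TensorFlow, and with TensorFlow 2.16+ the extra tf-keras package is typically needed as well.

    transformers
    torch
    deepface
    opencv-python-headless
    gradio
    numpy
    Pillow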