Futuretop committed
Commit 4f843fa · verified · 1 Parent(s): 2a2fa4c

Update app.py

Files changed (1)
  1. app.py +94 -68
app.py CHANGED
@@ -6,72 +6,47 @@ import numpy as np
  from deepface import DeepFace
  import gradio as gr
 
- # ====== Model loading ======
  processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
  model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
- face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
 
- # ====== Clothing info extraction function ======
  def extract_clothing(text):
      colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
      patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
      items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
               'suit', 'sneakers', 'hat', 'scarf', 'uniform']
-
      found_colors = [c for c in colors if c in text.lower()]
-     found_patterns = [p for p in patterns if p in text.lower()]
-     found_items = [i for i in text.lower().split() if i in items]
-
      return found_colors, found_patterns, found_items
 
- # ====== Final description generation function ======
- def generate_15_sentences(caption, num_faces, age_summary, clothing_sentence):
-     sentences = []
-     sentences.append(f"The image presents the scene: {caption}.")
-     sentences.append("The visual tone combines human presence with context-rich elements.")
-     sentences.append(f"A total of {num_faces} people with visible faces were detected.")
-
-     if age_summary:
-         summary_list = [f"{v} {k}(s)" for k, v in age_summary.items()]
-         sentences.append("The crowd includes " + ", ".join(summary_list) + ".")
-     else:
-         sentences.append("No specific age or gender details were identified.")
-
-     sentences.append(clothing_sentence)
-     sentences.append("Facial expressions range from neutral to slightly expressive, adding emotional context.")
-     sentences.append("Some individuals appear to be interacting with the environment or each other.")
-     sentences.append("Although specific facial shapes are not automatically classified here, a mix of face sizes and angles is present.")
-     sentences.append("Hairstyles vary, including short hair, longer cuts, and tied-back styles depending on individual orientation.")
-     sentences.append("The photo captures diversity not only in people but also in visual textures and tones.")
-     sentences.append("Clothing styles vary, suggesting informal or casual settings rather than formal events.")
-     sentences.append("The spatial arrangement of individuals indicates natural movement or candid posture.")
-     sentences.append("Background elements such as buildings or trees provide additional narrative depth.")
-     sentences.append("The lighting helps highlight human features and adds dimensionality to the scene.")
-     sentences.append("Overall, the image blends appearance, age, fashion, and emotion into a coherent story.")
-     return sentences
-
- # ====== Main analysis function ======
- def analyze_uploaded_image(image_pil):
      image_pil = image_pil.convert("RGB")
      image_np = np.array(image_pil)
 
-     # 1. Caption generation (BLIP)
      inputs = processor(image_pil, return_tensors="pt")
      out = model.generate(**inputs)
      caption = processor.decode(out[0], skip_special_tokens=True)
 
-     # 2. Face detection (OpenCV)
      gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
      faces = face_cascade.detectMultiScale(gray, 1.1, 4)
 
-     # 3. Age/gender analysis with DeepFace
      face_infos = []
      for (x, y, w, h) in faces:
          face_crop = image_np[y:y+h, x:x+w]
          try:
-             analysis = DeepFace.analyze(face_crop, actions=['age', 'gender'], enforce_detection=False)
              age = analysis[0]['age']
              gender = analysis[0]['gender']
              if age < 13:
                  age_group = "child"
              elif age < 20:
@@ -80,44 +55,95 @@ def analyze_uploaded_image(image_pil):
                  age_group = "adult"
              else:
                  age_group = "senior"
              face_infos.append({
-                 "age_group": age_group,
                  "gender": gender,
              })
-         except:
              continue
 
      num_faces = len(face_infos)
-
-     # 4. Age group summary
      age_summary = {}
      for face in face_infos:
-         key = f"{face['gender']} {face['age_group']}"
-         age_summary[key] = age_summary.get(key, 0) + 1
 
-     # 5. Clothing info extraction
      colors, patterns, items = extract_clothing(caption)
-     parts = []
-     if colors:
-         parts.append(f"colors such as {', '.join(colors)}")
-     if patterns:
-         parts.append(f"patterns like {', '.join(patterns)}")
-     if items:
-         parts.append(f"clothing items such as {', '.join(items)}")
-     clothing_sentence = "The clothing observed includes " + " with ".join(parts) + "." if parts else "Clothing is present but not clearly distinguishable."
-
-     # 6. Final description generation
-     final_description = generate_15_sentences(caption, num_faces, age_summary, clothing_sentence)
-     return "\n".join([f"{i+1}. {s}" for i, s in enumerate(final_description)])
-
- # ====== Gradio interface setup ======
- interface = gr.Interface(
-     fn=analyze_uploaded_image,
-     inputs=gr.Image(type="pil", label="Upload an image"),
-     outputs=gr.Textbox(label="15-sentence image description"),
-     title="🧠 Image Recognition Describer (BLIP + DeepFace)",
-     description="Upload an image and it will be described in 15 sentences covering the number of people, gender, age group, clothing, mood, and more."
  )
 
- # ====== Run the app ======
- interface.launch()
 
  from deepface import DeepFace
  import gradio as gr
 
+ # Load BLIP model
  processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
  model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
 
+ # Clothing extractor
  def extract_clothing(text):
      colors = ['red', 'blue', 'green', 'black', 'white', 'yellow', 'brown', 'gray', 'pink', 'orange']
      patterns = ['striped', 'checkered', 'plaid', 'polka-dot', 'solid', 'patterned', 'floral']
      items = ['jacket', 'coat', 'dress', 'shirt', 't-shirt', 'jeans', 'pants', 'shorts',
               'suit', 'sneakers', 'hat', 'scarf', 'uniform']
+
      found_colors = [c for c in colors if c in text.lower()]
+     found_patterns = [p for p in patterns if p in text.lower()]
+     found_items = [i for i in items if i in text.lower()]
+
      return found_colors, found_patterns, found_items
 
+ # Main function
+ def analyze_image(image_pil):
      image_pil = image_pil.convert("RGB")
      image_np = np.array(image_pil)
 
+     # Caption generation
      inputs = processor(image_pil, return_tensors="pt")
      out = model.generate(**inputs)
      caption = processor.decode(out[0], skip_special_tokens=True)
 
+     # Face detection
+     face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + "haarcascade_frontalface_default.xml")
      gray = cv2.cvtColor(image_np, cv2.COLOR_RGB2GRAY)
      faces = face_cascade.detectMultiScale(gray, 1.1, 4)
 
      face_infos = []
      for (x, y, w, h) in faces:
          face_crop = image_np[y:y+h, x:x+w]
          try:
+             analysis = DeepFace.analyze(face_crop, actions=['age', 'gender', 'emotion'], enforce_detection=False)
              age = analysis[0]['age']
              gender = analysis[0]['gender']
+             emotion = analysis[0]['dominant_emotion']
+
              if age < 13:
                  age_group = "child"
              elif age < 20:
 
                  age_group = "adult"
              else:
                  age_group = "senior"
+
              face_infos.append({
+                 "age": age,
                  "gender": gender,
+                 "age_group": age_group,
+                 "emotion": emotion
              })
+         except Exception:
              continue
 
+     # Summary stats
      num_faces = len(face_infos)
+     gender_counts = {"Man": 0, "Woman": 0}
      age_summary = {}
+     emotion_summary = {}
+
      for face in face_infos:
+         gender = face['gender']
+         age_group = face['age_group']
+         emotion = face['emotion']
 
+         gender_counts[gender] += 1
+         age_summary[age_group] = age_summary.get(age_group, 0) + 1
+         emotion_summary[emotion] = emotion_summary.get(emotion, 0) + 1
+
+     # Clothing info from caption
      colors, patterns, items = extract_clothing(caption)
+
+     # Generate 15 sentences
+     sentences = []
+     sentences.append(f"According to the BLIP model, the scene can be described as: \"{caption}\".")
+     sentences.append(f"The image contains {num_faces} visible face(s) detected by OpenCV.")
+
+     gender_desc = []
+     if gender_counts["Man"] > 0:
+         gender_desc.append(f"{gender_counts['Man']} male(s)")
+     if gender_counts["Woman"] > 0:
+         gender_desc.append(f"{gender_counts['Woman']} female(s)")
+     if gender_desc:
+         sentences.append("Gender distribution shows " + " and ".join(gender_desc) + ".")
+     else:
+         sentences.append("Gender analysis was inconclusive.")
+
+     if age_summary:
+         age_list = [f"{count} {group}(s)" for group, count in age_summary.items()]
+         sentences.append("Age groups represented include " + ", ".join(age_list) + ".")
+     else:
+         sentences.append("No conclusive age groupings found.")
+
+     if emotion_summary:
+         emo_list = [f"{count} showing {emo}" for emo, count in emotion_summary.items()]
+         sentences.append("Facial expressions include " + ", ".join(emo_list) + ".")
+     else:
+         sentences.append("Emotion detection yielded limited results.")
+
+     if colors or patterns or items:
+         cloth_parts = []
+         if colors:
+             cloth_parts.append(f"colors like {', '.join(colors)}")
+         if patterns:
+             cloth_parts.append(f"patterns such as {', '.join(patterns)}")
+         if items:
+             cloth_parts.append(f"items like {', '.join(items)}")
+         sentences.append("The clothing observed includes " + " and ".join(cloth_parts) + ".")
+     else:
+         sentences.append("Clothing details were not clearly identified.")
+
+     if num_faces > 0:
+         sentences.append("Faces are distributed naturally across the image.")
+         sentences.append("Differences in face size suggest variation in distance from the camera.")
+         sentences.append("Hairstyles appear diverse, from short to tied-back styles.")
+         sentences.append("Lighting emphasizes certain facial features and expressions.")
+         sentences.append("Some individuals face the camera while others look away.")
+         sentences.append("Mood diversity is reflected in the variety of facial expressions.")
+         sentences.append("The clothing style appears casual or semi-formal.")
+     else:
+         sentences.append("No visible faces were found to analyze further visual characteristics.")
+
+     sentences.append("Overall, the image integrates facial, emotional, and clothing features into a cohesive scene.")
+
+     return "\n".join([f"{i+1}. {s}" for i, s in enumerate(sentences)])
+
+ # Gradio Interface
+ demo = gr.Interface(
+     fn=analyze_image,
+     inputs=gr.Image(type="pil"),
+     outputs=gr.Textbox(label="📝 15-Sentence Detailed Description"),
+     title="🖼️ Image Analysis with BLIP + DeepFace",
+     description="Upload an image to get a detailed 15-sentence description of facial features, age, gender, clothing, and more."
  )
 
+ demo.launch()
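
A quick way to sanity-check the keyword matcher is to call extract_clothing on a BLIP-style caption. The sketch below is illustrative only: the caption string is invented, and it assumes extract_clothing from app.py is already defined in the session (importing app.py directly would also run demo.launch()).

# Minimal check of extract_clothing on a made-up caption (illustrative only).
caption = "a woman in a striped red dress and white sneakers standing on a street"
colors, patterns, items = extract_clothing(caption)
print(colors)    # expected: ['red', 'white']
print(patterns)  # expected: ['striped']
print(items)     # expected: ['dress', 'sneakers']

Because extract_clothing relies on plain substring matching against the lowercased caption, short keywords can over-match (for example, 'hat' would also be found in a caption containing the word 'that'), which is worth keeping in mind when extending the keyword lists.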