Segizu committed on
Commit
b6a67be
·
1 Parent(s): 288a128
Files changed (1) hide show
  1. app.py +56 -25
app.py CHANGED
@@ -5,16 +5,19 @@ from deepface import DeepFace
5
  from datasets import load_dataset
6
  import os
7
  import pickle
 
 
8
  from pathlib import Path
9
  import gc
10
  import requests
11
- from io import BytesIO
12
 
13
- # 📁 Carpeta para guardar cada embedding
14
- EMBEDDINGS_DIR = Path("embeddings")
15
- EMBEDDINGS_DIR.mkdir(exist_ok=True)
 
 
16
 
17
- # ✅ Cargar dataset CSV
18
  dataset = load_dataset(
19
  "csv",
20
  data_files="metadata.csv",
@@ -27,17 +30,13 @@ print("✅ Validación post-carga")
27
  print(dataset[0])
28
  print("Columnas:", dataset.column_names)
29
 
30
- # 🔄 Preprocesamiento para DeepFace
31
  def preprocess_image(img: Image.Image) -> np.ndarray:
32
  img_rgb = img.convert("RGB")
33
  img_resized = img_rgb.resize((160, 160), Image.Resampling.LANCZOS)
34
  return np.array(img_resized)
35
 
36
- # 🔐 Header si el dataset es privado
37
- HF_TOKEN = os.getenv("HF_TOKEN")
38
- headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
39
-
40
- # 📦 Construir base (embedding por archivo)
41
  def build_database():
42
  print("🔄 Generando embeddings...")
43
  batch_size = 10
@@ -50,15 +49,25 @@ def build_database():
50
  item = {"image": batch["image"][j]}
51
  image_url = item["image"]
52
 
53
- # Validar
54
  if not isinstance(image_url, str) or not image_url.startswith("http") or image_url.strip().lower() == "image":
55
  print(f"⚠️ Saltando {i + j} - URL inválida: {image_url}")
56
  continue
57
 
58
  name = f"image_{i + j}"
59
- emb_path = EMBEDDINGS_DIR / f"{name}.pkl"
60
- if emb_path.exists():
61
- continue # Ya existe
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  try:
64
  response = requests.get(image_url, headers=headers, timeout=10)
@@ -72,11 +81,20 @@ def build_database():
72
  enforce_detection=False
73
  )[0]["embedding"]
74
 
75
- # Guardar como archivo individual
76
- with open(emb_path, "wb") as f:
77
  pickle.dump({"name": name, "img": img, "embedding": embedding}, f)
78
 
79
- print(f"✅ Guardado: {name}")
 
 
 
 
 
 
 
 
 
80
  del img_processed
81
  gc.collect()
82
 
@@ -84,7 +102,7 @@ def build_database():
84
  print(f"❌ Error en {name}: {e}")
85
  continue
86
 
87
- # 🔍 Buscar similitudes
88
  def find_similar_faces(uploaded_image: Image.Image):
89
  try:
90
  img_processed = preprocess_image(uploaded_image)
@@ -100,10 +118,24 @@ def find_similar_faces(uploaded_image: Image.Image):
100
 
101
  similarities = []
102
 
103
- for emb_file in EMBEDDINGS_DIR.glob("*.pkl"):
 
 
 
 
 
 
 
 
 
 
104
  try:
105
- with open(emb_file, "rb") as f:
106
- record = pickle.load(f)
 
 
 
 
107
 
108
  name = record["name"]
109
  img = record["img"]
@@ -114,17 +146,16 @@ def find_similar_faces(uploaded_image: Image.Image):
114
  similarities.append((sim_score, name, np.array(img)))
115
 
116
  except Exception as e:
117
- print(f"⚠ Error leyendo {emb_file}: {e}")
118
  continue
119
 
120
  similarities.sort(reverse=True)
121
  top = similarities[:5]
122
-
123
  gallery = [(img, f"{name} - Similitud: {sim:.2f}") for sim, name, img in top]
124
  summary = "\n".join([f"{name} - Similitud: {sim:.2f}" for sim, name, _ in top])
125
  return gallery, summary
126
 
127
- # 🚀 Ejecutar al inicio
128
  print("🚀 Iniciando app...")
129
  build_database()
130
 
 
5
  from datasets import load_dataset
6
  import os
7
  import pickle
8
+ from io import BytesIO
9
+ from huggingface_hub import upload_file, hf_hub_download
10
  from pathlib import Path
11
  import gc
12
  import requests
 
13
 
14
+ # 📁 Parámetros
15
+ DATASET_ID = "Segizu/facial-recognition"
16
+ EMBEDDINGS_SUBFOLDER = "embeddings"
17
+ HF_TOKEN = os.getenv("HF_TOKEN")
18
+ headers = {"Authorization": f"Bearer {HF_TOKEN}"} if HF_TOKEN else {}
19
 
20
+ # ✅ Cargar CSV desde el dataset
21
  dataset = load_dataset(
22
  "csv",
23
  data_files="metadata.csv",
 
30
  print(dataset[0])
31
  print("Columnas:", dataset.column_names)
32
 
33
+ # 🔄 Preprocesamiento
34
  def preprocess_image(img: Image.Image) -> np.ndarray:
35
  img_rgb = img.convert("RGB")
36
  img_resized = img_rgb.resize((160, 160), Image.Resampling.LANCZOS)
37
  return np.array(img_resized)
38
 
39
+ # 📦 Generar y subir embeddings
 
 
 
 
40
  def build_database():
41
  print("🔄 Generando embeddings...")
42
  batch_size = 10
 
49
  item = {"image": batch["image"][j]}
50
  image_url = item["image"]
51
 
 
52
  if not isinstance(image_url, str) or not image_url.startswith("http") or image_url.strip().lower() == "image":
53
  print(f"⚠️ Saltando {i + j} - URL inválida: {image_url}")
54
  continue
55
 
56
  name = f"image_{i + j}"
57
+ filename = f"{name}.pkl"
58
+
59
+ # Verificar si ya está subido
60
+ try:
61
+ hf_hub_download(
62
+ repo_id=DATASET_ID,
63
+ repo_type="dataset",
64
+ filename=f"{EMBEDDINGS_SUBFOLDER}/{filename}",
65
+ token=HF_TOKEN
66
+ )
67
+ print(f"⏩ Ya existe remoto: {filename}")
68
+ continue
69
+ except:
70
+ pass
71
 
72
  try:
73
  response = requests.get(image_url, headers=headers, timeout=10)
 
81
  enforce_detection=False
82
  )[0]["embedding"]
83
 
84
+ # Guardar temporal y subir
85
+ with open(filename, "wb") as f:
86
  pickle.dump({"name": name, "img": img, "embedding": embedding}, f)
87
 
88
+ upload_file(
89
+ path_or_fileobj=filename,
90
+ path_in_repo=f"{EMBEDDINGS_SUBFOLDER}/{filename}",
91
+ repo_id=DATASET_ID,
92
+ repo_type="dataset",
93
+ token=HF_TOKEN
94
+ )
95
+ os.remove(filename)
96
+ print(f"✅ Subido: {filename}")
97
+
98
  del img_processed
99
  gc.collect()
100
 
 
102
  print(f"❌ Error en {name}: {e}")
103
  continue
104
 
105
+ # 🔍 Buscar similitudes desde archivos remotos
106
  def find_similar_faces(uploaded_image: Image.Image):
107
  try:
108
  img_processed = preprocess_image(uploaded_image)
 
118
 
119
  similarities = []
120
 
121
+ try:
122
+ # Obtener lista de archivos remotos
123
+ from huggingface_hub import list_repo_files
124
+ embedding_files = [
125
+ f for f in list_repo_files(DATASET_ID, repo_type="dataset", token=HF_TOKEN)
126
+ if f.startswith(f"{EMBEDDINGS_SUBFOLDER}/") and f.endswith(".pkl")
127
+ ]
128
+ except Exception as e:
129
+ return [], f"⚠ Error obteniendo archivos del dataset: {str(e)}"
130
+
131
+ for file_path in embedding_files:
132
  try:
133
+ file_bytes = requests.get(
134
+ f"https://huggingface.co/datasets/{DATASET_ID}/resolve/main/{file_path}",
135
+ headers=headers,
136
+ timeout=10
137
+ ).content
138
+ record = pickle.loads(file_bytes)
139
 
140
  name = record["name"]
141
  img = record["img"]
 
146
  similarities.append((sim_score, name, np.array(img)))
147
 
148
  except Exception as e:
149
+ print(f"⚠ Error con {file_path}: {e}")
150
  continue
151
 
152
  similarities.sort(reverse=True)
153
  top = similarities[:5]
 
154
  gallery = [(img, f"{name} - Similitud: {sim:.2f}") for sim, name, img in top]
155
  summary = "\n".join([f"{name} - Similitud: {sim:.2f}" for sim, name, _ in top])
156
  return gallery, summary
157
 
158
+ # 🚀 Inicializar
159
  print("🚀 Iniciando app...")
160
  build_database()
161