chore: resample-prep
Use OpenCV's Haar cascades for face detection and re-sample roughly 20% of the original eval set
- scripts/resample_evalset.py +77 -0
scripts/resample_evalset.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import random
|
3 |
+
import cv2
|
4 |
+
from datetime import datetime
|
5 |
+
import logging
|
6 |
+
|
7 |
+
# Configure the root logger once, at import time: records at INFO and above
# are written to ``sample_images.log`` (a path relative to the current working
# directory) with a timestamp and level prefix on every line.
log_file = "sample_images.log"
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
|
11 |
+
|
12 |
+
# Lazily-initialised frontal-face classifier shared by every detect_faces()
# call, so the cascade XML is parsed once instead of once per image.
_face_cascade = None


def _get_face_cascade():
    """Return the shared Haar cascade classifier, loading it on first use.

    Raises:
        RuntimeError: If OpenCV could not load the cascade file.
    """
    global _face_cascade
    if _face_cascade is None:
        cascade_path = cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        cascade = cv2.CascadeClassifier(cascade_path)
        # CascadeClassifier does not raise on a missing/corrupt file; it just
        # stays empty and fails later inside detectMultiScale.
        if cascade.empty():
            raise RuntimeError(f"Failed to load Haar cascade from {cascade_path}")
        _face_cascade = cascade
    return _face_cascade


def detect_faces(image_path):
    """Report whether at least one frontal face is detected in an image.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True if the Haar cascade finds one or more faces; False if it finds
        none or if OpenCV cannot read the file as an image.

    Raises:
        RuntimeError: If the Haar cascade model cannot be loaded.
    """
    # Haar cascades operate on single-channel images, so read as grayscale.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # imread signals an unreadable/unsupported file with None, not an exception.
        return False

    faces = _get_face_cascade().detectMultiScale(
        image,
        scaleFactor=1.1,
        minNeighbors=5,
        minSize=(30, 30),
    )

    return len(faces) > 0
|
26 |
+
|
27 |
+
def sample_images(input_folder, output_folder, sample_rate=0.2):
    """Mirror ``input_folder`` into ``output_folder``, keeping only face images.

    The directory tree under ``input_folder`` is recreated under
    ``output_folder``. Every file with an image extension in which
    ``detect_faces`` finds a face is hard-linked into the matching output
    directory (or copied when a hard link cannot be created). One progress
    line is printed per visited directory, and each transferred file plus the
    final totals are written to the log.

    Args:
        input_folder: Root directory to scan recursively.
        output_folder: Root directory that receives the selected files; it is
            created if missing.
        sample_rate: Accepted for interface compatibility. NOTE(review): the
            selection logic does not read this value -- every image with a
            detected face is kept. Confirm whether random sub-sampling was
            intended.

    Returns:
        None.
    """
    # Local import: only needed for the copy fallback below.
    import shutil

    os.makedirs(output_folder, exist_ok=True)

    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    total_files = 0
    sampled_files = 0
    start_time = datetime.now()

    for root, _dirs, files in os.walk(input_folder):
        relative_path = os.path.relpath(root, input_folder)
        output_subfolder = os.path.join(output_folder, relative_path)
        # Every visited directory is mirrored, even if nothing is selected in it.
        os.makedirs(output_subfolder, exist_ok=True)

        # The total counts every file seen, not just images.
        total_files += len(files)

        sampled_files_this_batch = [
            file for file in files
            if file.lower().endswith(image_extensions)
            and detect_faces(os.path.join(root, file))
        ]
        sampled_files += len(sampled_files_this_batch)

        for file in sampled_files_this_batch:
            input_file_path = os.path.join(root, file)
            output_file_path = os.path.join(output_subfolder, file)

            # A previous run may already have produced this file; os.link
            # would raise FileExistsError, so skip it to keep re-runs safe.
            if os.path.lexists(output_file_path):
                logging.info(f"Skipped {input_file_path}: {output_file_path} already exists")
                continue

            try:
                os.link(input_file_path, output_file_path)
            except OSError:
                # Hard links fail across filesystems or where unsupported;
                # fall back to a real copy so the file is still sampled.
                shutil.copy2(input_file_path, output_file_path)

            logging.info(f"Sampled and copied {input_file_path} to {output_file_path}")

        elapsed_time = datetime.now() - start_time
        print(f"Processed {sampled_files}/{total_files} files in {elapsed_time}")

    end_time = datetime.now()
    total_time = end_time - start_time
    logging.info(f"Total time taken: {total_time}")
    logging.info(f"Sampled {sampled_files} out of {total_files} files.")
|
73 |
+
|
74 |
+
if __name__ == "__main__":
    # Script entry point. Both folders are optional positional arguments whose
    # defaults match the previously hard-coded values, so running the script
    # with no arguments behaves as before.
    import argparse

    parser = argparse.ArgumentParser(
        description="Copy the images that contain a detectable face into a mirrored folder tree."
    )
    parser.add_argument("input_folder", nargs="?", default="EvalSet",
                        help="folder to scan recursively (default: %(default)s)")
    parser.add_argument("output_folder", nargs="?", default="resampledEvalSet",
                        help="folder that receives the selected images (default: %(default)s)")
    args = parser.parse_args()

    # os.walk yields nothing for a missing directory, which would otherwise
    # produce an empty output tree without any error.
    if not os.path.isdir(args.input_folder):
        parser.error(f"input folder not found: {args.input_folder}")

    input_folder = args.input_folder
    output_folder = args.output_folder
    sample_images(input_folder, output_folder)
|