LPX55 committed on
Commit
e174112
·
verified ·
1 Parent(s): 4c6c42a

chore: resample-prep

Browse files

OpenCV's Haar Cascades for face detection + re-sampling roughly 20% of original evalset

Files changed (1) hide show
  1. scripts/resample_evalset.py +77 -0
scripts/resample_evalset.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import random
3
+ import cv2
4
+ from datetime import datetime
5
+ import logging
6
+
7
# Route INFO-and-above records from the root logger to a log file in the
# current working directory, each prefixed with a timestamp and its level.
log_file = "sample_images.log"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=log_file,
)
11
+
12
# Lazily-initialised classifier shared by every detect_faces() call.
_FACE_CASCADE = None


def _get_face_cascade():
    """Return the frontal-face Haar cascade, loading it from disk on first use.

    Raises:
        RuntimeError: if the cascade file could not be loaded.
    """
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        cascade_path = os.path.join(
            cv2.data.haarcascades, 'haarcascade_frontalface_default.xml'
        )
        cascade = cv2.CascadeClassifier(cascade_path)
        # A failed load yields an empty classifier rather than an exception;
        # surface that here instead of failing later inside detectMultiScale.
        if cascade.empty():
            raise RuntimeError(f"Could not load Haar cascade from {cascade_path}")
        _FACE_CASCADE = cascade
    return _FACE_CASCADE


def detect_faces(image_path):
    """Report whether at least one frontal face is detected in an image.

    The image is read in grayscale and scanned with OpenCV's pre-trained
    ``haarcascade_frontalface_default`` classifier. The classifier is loaded
    once and reused, so calling this in a loop does not re-parse the cascade
    file for every image.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True if one or more faces are detected; False if none are detected
        or if the file cannot be read as an image.

    Raises:
        RuntimeError: if the Haar cascade file could not be loaded.
    """
    face_cascade = _get_face_cascade()

    # cv2.imread signals an unreadable or unsupported file by returning None.
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        logging.warning(f"Could not read image {image_path}; treating as no face")
        return False

    faces = face_cascade.detectMultiScale(
        image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )
    return len(faces) > 0
26
+
27
# File extensions (compared case-insensitively) that are treated as images.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')


def _link_or_copy(src, dst):
    """Hard-link ``src`` to ``dst``, falling back to a copy if linking fails.

    Returns:
        True if ``dst`` was created, False if it already existed and was
        left untouched.
    """
    import shutil

    try:
        os.link(src, dst)
    except FileExistsError:
        # A previous run already produced this file; keep it so that the
        # script can be re-run without crashing.
        return False
    except OSError:
        # Hard links are not possible across filesystems or on some
        # platforms; a regular copy gives the same visible result.
        shutil.copy2(src, dst)
    return True


def sample_images(input_folder, output_folder, sample_rate=0.2):
    """Mirror a random sample of the face images from one folder tree to another.

    Walks ``input_folder`` recursively and recreates its directory structure
    under ``output_folder``. Each image file is first selected at random with
    probability ``sample_rate``; selected files are then kept only if
    ``detect_faces`` finds a face in them. Kept files are hard-linked (or
    copied when linking is not possible) to the matching output location.
    Progress is printed after every directory and a summary is logged at the
    end.

    Args:
        input_folder: Root of the directory tree to sample from.
        output_folder: Root of the directory tree to populate; created if
            missing.
        sample_rate: Probability in [0, 1] that an image is considered for
            sampling. 1.0 keeps every image with a face, 0.0 keeps none.

    Raises:
        ValueError: if ``sample_rate`` is outside [0, 1].
    """
    if not 0.0 <= sample_rate <= 1.0:
        raise ValueError(f"sample_rate must be between 0 and 1, got {sample_rate!r}")

    os.makedirs(output_folder, exist_ok=True)

    total_files = 0
    sampled_files = 0
    start_time = datetime.now()

    for root, _dirs, files in os.walk(input_folder):
        relative_path = os.path.relpath(root, input_folder)
        output_subfolder = os.path.join(output_folder, relative_path)
        os.makedirs(output_subfolder, exist_ok=True)

        # Counts every file seen, not only images.
        total_files += len(files)

        for file_name in files:
            if not file_name.lower().endswith(_IMAGE_EXTENSIONS):
                continue
            # Draw before running detection: the random draw is independent
            # of the image content, so this selects the same distribution of
            # files while skipping the expensive detector for rejected ones.
            if random.random() >= sample_rate:
                continue

            input_file_path = os.path.join(root, file_name)
            if not detect_faces(input_file_path):
                continue

            output_file_path = os.path.join(output_subfolder, file_name)
            if _link_or_copy(input_file_path, output_file_path):
                logging.info(f"Sampled and copied {input_file_path} to {output_file_path}")
            else:
                logging.info(f"Skipped existing output file {output_file_path}")
            sampled_files += 1

        # Per-directory progress report.
        elapsed_time = datetime.now() - start_time
        print(f"Processed {sampled_files}/{total_files} files in {elapsed_time}")

    total_time = datetime.now() - start_time
    logging.info(f"Total time taken: {total_time}")
    logging.info(f"Sampled {sampled_files} out of {total_files} files.")
73
+
74
if __name__ == "__main__":
    # Default dataset locations, resolved relative to the working directory.
    input_folder, output_folder = "EvalSet", "resampledEvalSet"
    sample_images(input_folder, output_folder)