import glob
import os
import random
import shutil
from pathlib import Path

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; keep the script usable without it.
    def tqdm(iterable, *args, **kwargs):
        """Return *iterable* unchanged (fallback when tqdm is not installed)."""
        return iterable


# Extensions of the resized images that are eligible for sampling.
_IMAGE_EXTENSIONS = ("jpg", "png", "jpeg")
# Suffixes of the per-image feature files: "<image base name>_<suffix>.npy".
_FEATURE_SUFFIXES = ("clip", "caption")


def sample_images_and_features(
    image_folder: str,
    feature_folder: str,
    sample_size: int,
    dest_image_folder: str,
    dest_feature_folder: str,
) -> None:
    """
    Randomly sample resized images and copy them with their feature files.

    Images are the files in ``image_folder`` named ``resized_*.jpg``,
    ``resized_*.png`` or ``resized_*.jpeg``. For every sampled image the
    files ``<base>_clip.npy`` and ``<base>_caption.npy`` (where ``<base>`` is
    the image file name without its extension) are copied from
    ``feature_folder`` when they exist; a missing feature file is skipped
    silently.

    Args:
        image_folder (str): Path to the folder containing resized images.
        feature_folder (str): Path to the folder containing feature files.
        sample_size (int): Number of images to sample.
        dest_image_folder (str): Destination folder for sampled images.
        dest_feature_folder (str): Destination folder for sampled feature files.

    Raises:
        ValueError: If ``image_folder`` holds fewer than ``sample_size``
            matching images (``random.sample`` also raises ``ValueError``
            for a negative ``sample_size``).
    """
    # Ensure destination folders exist
    os.makedirs(dest_image_folder, exist_ok=True)
    os.makedirs(dest_feature_folder, exist_ok=True)

    # Escape the directory part so glob metacharacters in the folder path
    # (e.g. "images[v1]") are matched literally instead of as a pattern.
    pattern_root = glob.escape(image_folder)
    image_files = []
    for extension in _IMAGE_EXTENSIONS:
        pattern = os.path.join(pattern_root, f"resized_*.{extension}")
        image_files.extend(glob.glob(pattern))

    # glob returns entries in filesystem order; sorting makes a seeded
    # `random` draw the same sample on every machine.
    image_files.sort()

    # Check if there are enough images
    if len(image_files) < sample_size:
        raise ValueError("Not enough resized images in the source folder.")

    sampled_images = random.sample(image_files, sample_size)

    for image_path in tqdm(sampled_images):
        base_name, _ = os.path.splitext(os.path.basename(image_path))

        # copy2 preserves file metadata (timestamps, permissions)
        shutil.copy2(image_path, dest_image_folder)

        # Feature files are optional: copy whichever ones are present.
        for suffix in _FEATURE_SUFFIXES:
            feature_path = os.path.join(feature_folder, f"{base_name}_{suffix}.npy")
            if os.path.exists(feature_path):
                shutil.copy2(feature_path, dest_feature_folder)


def main() -> None:
    """Sample 10 images from ``data/`` into ``data_temp/`` next to this script."""
    project_root = Path(__file__).resolve().parent

    image_folder = str(project_root / "data/images")
    feature_folder = str(project_root / "data/features")
    sample_size = 10
    dest_image_folder = str(project_root / "data_temp/images")
    dest_feature_folder = str(project_root / "data_temp/features")

    sample_images_and_features(
        image_folder,
        feature_folder,
        sample_size,
        dest_image_folder,
        dest_feature_folder,
    )


if __name__ == "__main__":
    main()