# Eye-Movement-Recognition / dataset_preparation_sequences.py
# (uploaded by shayan5422 — "Upload 12 files", commit 1d4559c verified)
# dataset_preparation_sequences.py
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import pickle
def load_sequences(preprocessed_dir='preprocessed_sequences'):
    """
    Loads preprocessed sequences and their labels.

    Args:
        preprocessed_dir (str): Directory containing one subdirectory per
            class, each holding ``.npy`` sequence files.

    Returns:
        tuple: (X, y, label_map) where
            - X (list of numpy.ndarray): sequences; frame counts may differ.
            - y (numpy.ndarray): one-hot labels, float32, shape
              (num_samples, num_classes).
            - label_map (dict): class name -> integer index.
    """
    X = []
    y = []
    # BUG FIX: only directories are classes. Previously every entry of
    # os.listdir() (including stray files) was assigned a label index
    # before the isdir check, inflating num_classes with phantom classes.
    classes = sorted(
        d for d in os.listdir(preprocessed_dir)
        if os.path.isdir(os.path.join(preprocessed_dir, d))
    )
    label_map = {cls: idx for idx, cls in enumerate(classes)}
    for cls, idx in label_map.items():
        cls_path = os.path.join(preprocessed_dir, cls)
        # sorted() makes sample order deterministic across filesystems.
        for seq_file in sorted(f for f in os.listdir(cls_path) if f.endswith('.npy')):
            X.append(np.load(os.path.join(cls_path, seq_file)))
            y.append(idx)
    # One-hot encode with NumPy; equivalent to keras.utils.to_categorical
    # (float32 output) but avoids pulling TensorFlow into data preparation.
    y = np.eye(len(label_map), dtype='float32')[np.asarray(y, dtype=int)]
    return X, y, label_map
def pad_sequences_fixed(X, max_seq_length):
    """
    Force every sequence to exactly max_seq_length frames.

    Shorter sequences are zero-padded at the end; longer ones are cut off.

    Args:
        X (list of numpy.ndarray): Sequences shaped (frames, height, width, channels).
        max_seq_length (int): Target number of frames.

    Returns:
        numpy.ndarray: Stacked array of uniformly-sized sequences.
    """
    def _fit(seq):
        # Truncate when the sequence is already long enough.
        if seq.shape[0] >= max_seq_length:
            return seq[:max_seq_length]
        # Otherwise append zero frames of the same spatial shape and dtype.
        filler = np.zeros((max_seq_length - seq.shape[0], *seq.shape[1:]), dtype=seq.dtype)
        return np.concatenate((seq, filler), axis=0)

    return np.array([_fit(seq) for seq in X])
def save_dataset(X_train, X_test, y_train, y_test, label_map, output_path='dataset_sequences.pkl'):
    """
    Saves the dataset into a pickle file.

    Args:
        X_train, X_test, y_train, y_test: Split data.
        label_map (dict): Mapping from class names to indices.
        output_path (str): Path to save the pickle file.
    """
    # Assemble the payload first so the file handle is held only for the dump.
    payload = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'label_map': label_map,
    }
    with open(output_path, 'wb') as f:
        pickle.dump(payload, f)
    print(f"Dataset saved to {output_path}.")
def load_dataset_pickle(pickle_path='dataset_sequences.pkl'):
    """
    Loads the dataset from a pickle file.

    Args:
        pickle_path (str): Path to the pickle file.

    Returns:
        tuple: (X_train, X_test, y_train, y_test, label_map).
    """
    # NOTE: pickle.load must only be used on trusted files.
    with open(pickle_path, 'rb') as f:
        payload = pickle.load(f)
    # Unpack in the fixed order callers expect.
    keys = ('X_train', 'X_test', 'y_train', 'y_test', 'label_map')
    return tuple(payload[k] for k in keys)
if __name__ == "__main__":
    # Load sequences (X is a list of variable-length arrays).
    X, y, label_map = load_sequences(preprocessed_dir='preprocessed_sequences')
    print(f"Total samples: {len(X)}")
    # Fail fast with a clear message instead of max() raising an opaque
    # "max() arg is an empty sequence" ValueError on an empty dataset.
    if not X:
        raise SystemExit("No sequences found in 'preprocessed_sequences'; nothing to prepare.")
    # Find the maximum sequence length for padding.
    max_seq_length = max(seq.shape[0] for seq in X)
    print(f"Maximum sequence length: {max_seq_length}")
    # Pad/truncate every sequence to the same length.
    X_padded = pad_sequences_fixed(X, max_seq_length)
    print(f"Padded sequences shape: {X_padded.shape}")
    # Split into training and testing sets (fixed seed for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)
    print(f"Training samples: {X_train.shape[0]}")
    print(f"Testing samples: {X_test.shape[0]}")
    # Persist the prepared dataset.
    save_dataset(X_train, X_test, y_train, y_test, label_map, output_path='dataset_sequences.pkl')