# Eye-Movement-Recognition / dataset_preparation_sequences.py
# (uploaded by shayan5422 — "Upload 12 files", commit 1d4559c verified)
# dataset_preparation_sequences.py
import os
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import pickle
def load_sequences(preprocessed_dir='preprocessed_sequences'):
    """
    Loads preprocessed sequences and their labels.

    Args:
        preprocessed_dir (str): Directory containing one subdirectory per
            class, each holding ``.npy`` sequence files.

    Returns:
        tuple: (X, y, label_map) where
            - X (list of numpy.ndarray): sequences; frame counts may differ.
            - y (numpy.ndarray): one-hot labels, float32, shape
              (num_samples, num_classes).
            - label_map (dict): class name -> integer index.
    """
    X = []
    y = []
    # BUG FIX: only directories are classes. Previously every entry of
    # os.listdir() (including stray files) was assigned a label index
    # before the isdir check, inflating num_classes with phantom classes.
    classes = sorted(
        d for d in os.listdir(preprocessed_dir)
        if os.path.isdir(os.path.join(preprocessed_dir, d))
    )
    label_map = {cls: idx for idx, cls in enumerate(classes)}
    for cls, idx in label_map.items():
        cls_path = os.path.join(preprocessed_dir, cls)
        # sorted() makes sample order deterministic across filesystems.
        for seq_file in sorted(f for f in os.listdir(cls_path) if f.endswith('.npy')):
            X.append(np.load(os.path.join(cls_path, seq_file)))
            y.append(idx)
    # One-hot encode with NumPy; equivalent to keras.utils.to_categorical
    # (float32 output) but avoids pulling TensorFlow into data preparation.
    y = np.eye(len(label_map), dtype='float32')[np.asarray(y, dtype=int)]
    return X, y, label_map
def pad_sequences_fixed(X, max_seq_length):
    """
    Force every sequence to exactly max_seq_length frames.

    Shorter sequences are zero-padded at the end; longer ones are cut off.

    Args:
        X (list of numpy.ndarray): Sequences shaped (frames, height, width, channels).
        max_seq_length (int): Target number of frames.

    Returns:
        numpy.ndarray: Stacked array of uniformly-sized sequences.
    """
    def _fit(seq):
        # Truncate when the sequence is already long enough.
        if seq.shape[0] >= max_seq_length:
            return seq[:max_seq_length]
        # Otherwise append zero frames of the same spatial shape and dtype.
        filler = np.zeros((max_seq_length - seq.shape[0], *seq.shape[1:]), dtype=seq.dtype)
        return np.concatenate((seq, filler), axis=0)

    return np.array([_fit(seq) for seq in X])
def save_dataset(X_train, X_test, y_train, y_test, label_map, output_path='dataset_sequences.pkl'):
    """
    Saves the dataset into a pickle file.

    Args:
        X_train, X_test, y_train, y_test: Split data.
        label_map (dict): Mapping from class names to indices.
        output_path (str): Path to save the pickle file.
    """
    # Assemble the payload first so the file handle is held only for the dump.
    payload = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
        'label_map': label_map,
    }
    with open(output_path, 'wb') as f:
        pickle.dump(payload, f)
    print(f"Dataset saved to {output_path}.")
def load_dataset_pickle(pickle_path='dataset_sequences.pkl'):
    """
    Loads the dataset from a pickle file.

    Args:
        pickle_path (str): Path to the pickle file.

    Returns:
        tuple: (X_train, X_test, y_train, y_test, label_map).
    """
    # NOTE: pickle.load must only be used on trusted files.
    with open(pickle_path, 'rb') as f:
        payload = pickle.load(f)
    # Unpack in the fixed order callers expect.
    keys = ('X_train', 'X_test', 'y_train', 'y_test', 'label_map')
    return tuple(payload[k] for k in keys)
if __name__ == "__main__":
    # Load sequences (X is a list of variable-length arrays).
    X, y, label_map = load_sequences(preprocessed_dir='preprocessed_sequences')
    print(f"Total samples: {len(X)}")
    # Fail fast with a clear message instead of max() raising an opaque
    # "max() arg is an empty sequence" ValueError on an empty dataset.
    if not X:
        raise SystemExit("No sequences found in 'preprocessed_sequences'; nothing to prepare.")
    # Find the maximum sequence length for padding.
    max_seq_length = max(seq.shape[0] for seq in X)
    print(f"Maximum sequence length: {max_seq_length}")
    # Pad/truncate every sequence to the same length.
    X_padded = pad_sequences_fixed(X, max_seq_length)
    print(f"Padded sequences shape: {X_padded.shape}")
    # Split into training and testing sets (fixed seed for reproducibility).
    X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)
    print(f"Training samples: {X_train.shape[0]}")
    print(f"Testing samples: {X_test.shape[0]}")
    # Persist the prepared dataset.
    save_dataset(X_train, X_test, y_train, y_test, label_map, output_path='dataset_sequences.pkl')