|
|
|
|
|
import os |
|
import numpy as np |
|
from sklearn.model_selection import train_test_split |
|
from tensorflow.keras.utils import to_categorical |
|
import pickle |
|
|
|
def load_sequences(preprocessed_dir='preprocessed_sequences'):

    """

    Loads preprocessed sequences and their labels.



    Each immediate subdirectory of ``preprocessed_dir`` is treated as one
    class; every ``.npy`` file inside it is one sample (an array whose first
    axis is frames).



    Args:

        preprocessed_dir (str): Directory containing preprocessed sequences.



    Returns:

        tuple: (X, y, label_map) where X is a list of numpy arrays, y is a
        one-hot encoded numpy array of shape (n_samples, n_classes), and
        label_map maps class name -> integer index.

    """

    X = []

    y = []

    label_map = {}

    # BUG FIX: only directories are classes. Previously label_map was
    # populated *before* the isdir check, so stray files (e.g. .DS_Store)
    # were registered as classes and inflated num_classes below.
    classes = sorted(

        entry for entry in os.listdir(preprocessed_dir)

        if os.path.isdir(os.path.join(preprocessed_dir, entry))

    )



    for idx, cls in enumerate(classes):

        label_map[cls] = idx

        cls_path = os.path.join(preprocessed_dir, cls)

        # Sorted for deterministic sample ordering across filesystems.
        sequence_files = sorted(f for f in os.listdir(cls_path) if f.endswith('.npy'))

        for seq_file in sequence_files:

            seq_path = os.path.join(cls_path, seq_file)

            sequence = np.load(seq_path)

            X.append(sequence)

            y.append(idx)



    y = np.array(y)

    y = to_categorical(y, num_classes=len(label_map))



    return X, y, label_map
|
|
|
def pad_sequences_fixed(X, max_seq_length):

    """

    Pads or truncates sequences to a fixed length.

    Sequences shorter than ``max_seq_length`` are zero-padded at the end
    (preserving dtype); longer ones are truncated to the first
    ``max_seq_length`` frames.



    Args:

        X (list of numpy.ndarray): List of sequences with shape (frames, height, width, channels).

        max_seq_length (int): Desired sequence length.



    Returns:

        numpy.ndarray: Padded/truncated sequences, stacked along axis 0.

    """

    fixed = []

    for sequence in X:

        n_frames = sequence.shape[0]

        if n_frames >= max_seq_length:

            # Long enough: keep only the leading frames.
            fixed.append(sequence[:max_seq_length])

        else:

            # Too short: append zero frames matching the per-frame shape.
            filler_shape = (max_seq_length - n_frames,) + sequence.shape[1:]

            filler = np.zeros(filler_shape, dtype=sequence.dtype)

            fixed.append(np.concatenate([sequence, filler], axis=0))

    return np.array(fixed)
|
|
|
def save_dataset(X_train, X_test, y_train, y_test, label_map, output_path='dataset_sequences.pkl'):

    """

    Saves the dataset into a pickle file.



    Args:

        X_train, X_test, y_train, y_test: Split data.

        label_map (dict): Mapping from class names to indices.

        output_path (str): Path to save the pickle file.

    """

    # Bundle everything into one dict so a single load restores the split.
    payload = {

        'X_train': X_train,

        'X_test': X_test,

        'y_train': y_train,

        'y_test': y_test,

        'label_map': label_map,

    }

    with open(output_path, 'wb') as f:

        pickle.dump(payload, f)

    print(f"Dataset saved to {output_path}.")
|
|
|
def load_dataset_pickle(pickle_path='dataset_sequences.pkl'):

    """

    Loads the dataset from a pickle file.

    Counterpart of ``save_dataset``: expects a dict with keys 'X_train',
    'X_test', 'y_train', 'y_test', and 'label_map'.



    Args:

        pickle_path (str): Path to the pickle file.



    Returns:

        tuple: Split data and label mapping.

    """

    with open(pickle_path, 'rb') as f:

        payload = pickle.load(f)

    return (

        payload['X_train'],

        payload['X_test'],

        payload['y_train'],

        payload['y_test'],

        payload['label_map'],

    )
|
|
|
if __name__ == "__main__":

    # Gather every preprocessed sequence along with its one-hot labels.
    X, y, label_map = load_sequences(preprocessed_dir='preprocessed_sequences')

    print(f"Total samples: {len(X)}")



    # Pad all samples up to the longest sequence observed in the dataset.
    max_seq_length = max(seq.shape[0] for seq in X)

    print(f"Maximum sequence length: {max_seq_length}")



    X_padded = pad_sequences_fixed(X, max_seq_length)

    print(f"Padded sequences shape: {X_padded.shape}")



    # 80/20 train/test split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

    print(f"Training samples: {X_train.shape[0]}")

    print(f"Testing samples: {X_test.shape[0]}")



    # Persist the split so training runs can skip re-preprocessing.
    save_dataset(X_train, X_test, y_train, y_test, label_map, output_path='dataset_sequences.pkl')
|
|