# Raf_Clustering.py (Jonas Leeb)
# Cluster sentence embeddings (SBERT / BERT / MLFPA) with KMeans in PCA space.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import time
from multiprocessing import Pool
embed_type = 'SBERT' # Change this to 'MLFPA' or 'BERT' as needed
# Load cached embeddings from parquet; on a cache miss, build the cache from the raw .npz export.
embed_name = embed_type.lower().replace("-", "")
parquet_path = f'{embed_type} embeddings/{embed_name}_embeddings.parquet'
try:
    embeddings_df = pd.read_parquet(parquet_path)
except FileNotFoundError:
    # Fall back to the .npz export (the array key matches the file stem, e.g. 'sbert_embedding')
    embeddings = np.load(f'{embed_type} embeddings/{embed_name}_embedding.npz')[f'{embed_name}_embedding']
    print(embeddings.shape)  # (n_documents, embedding_dim)
    print(type(embeddings))
    embeddings_df = pd.DataFrame(embeddings)
    embeddings_df.to_parquet(parquet_path)  # cache for subsequent runs
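# For reference, a sketch of how an .npz like the one above could be produced,
# assuming sentence-transformers; the model name and `texts` are illustrative,
# not taken from this project:
# from sentence_transformers import SentenceTransformer
# model = SentenceTransformer('all-MiniLM-L6-v2')
# emb = model.encode(texts, show_progress_bar=True)
# np.savez_compressed('SBERT embeddings/sbert_embedding.npz', sbert_embedding=emb)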
# Clustering pipeline: standardize, project with PCA, then KMeans.
def scale_and_pca(embeddings_df):
    # Standardize each embedding dimension to zero mean and unit variance
    scaler = StandardScaler()
    embeddings_scaled = scaler.fit_transform(embeddings_df)
    # Project onto the first three principal components
    pca = PCA(n_components=3)
    embeddings_pca = pca.fit_transform(embeddings_scaled)
    return embeddings_pca
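# Optional diagnostic (a sketch, not part of the original pipeline): report how
# much variance the retained components explain, to judge whether 3 PCs are enough.
def report_pca_variance(data, n_components=3):
    scaled = StandardScaler().fit_transform(data)
    pca = PCA(n_components=n_components).fit(scaled)
    print(f"Variance retained by {n_components} PCs: {pca.explained_variance_ratio_.sum():.1%}")
# report_pca_variance(embeddings_df)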
embeddings_pca = scale_and_pca(embeddings_df)
# Free the full-dimensional embeddings; only the PCA projection is used from here on.
del embeddings_df
# Create a 3D scatter plot of the PCA results
def plot_3d_scatter(embeddings_pca):
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1], embeddings_pca[:, 2], s=1)
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_zlabel('PC 3')
    plt.title(f'3D PCA of {embed_type} Embeddings')
    plt.show()
# plot_3d_scatter(embeddings_pca)
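# Variant (sketch): the same scatter colored by KMeans labels, useful after the
# clustering step below; `labels` is assumed to be a 1-D array of cluster ids.
def plot_3d_clusters(embeddings_pca, labels):
    fig = plt.figure(figsize=(10, 7))
    ax = fig.add_subplot(111, projection='3d')
    sc = ax.scatter(embeddings_pca[:, 0], embeddings_pca[:, 1], embeddings_pca[:, 2],
                    c=labels, cmap='tab10', s=1)
    ax.set_xlabel('PC 1')
    ax.set_ylabel('PC 2')
    ax.set_zlabel('PC 3')
    plt.title(f'KMeans clusters in PCA space ({embed_type})')
    fig.colorbar(sc, label='cluster')
    plt.show()
# plot_3d_clusters(embeddings_pca, labels_df['cluster_label'])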
# Silhouette sweep over k = 2..9, used to pick the cluster count (kept for reference):
# def compute_silhouette(n_clusters, data):
#     kmeans = KMeans(n_clusters=n_clusters, random_state=420)
#     labels = kmeans.fit_predict(data)
#     silhouette_avg = silhouette_score(data, labels)
#     print(f"For n_clusters = {n_clusters}, the silhouette score is: {silhouette_avg}")
#     return silhouette_avg
#
# silhouette_scores = []
# for i in range(2, 10):
#     start_time = time.time()
#     silhouette_scores.append(compute_silhouette(i, embeddings_pca))
#     end_time = time.time()
#     print(f"Time taken for n_clusters = {i}: {end_time - start_time} seconds")
#
# # Plot silhouette scores
# plt.figure(figsize=(10, 6))
# plt.plot(range(2, 10), silhouette_scores, marker='o')
# plt.title('Silhouette Scores for Different Cluster Sizes')
# plt.xlabel('Number of Clusters')
# plt.ylabel('Silhouette Score')
# plt.xticks(range(2, 10))
# plt.grid()
# plt.show()
#
# # Save silhouette scores to CSV
# silhouette_df = pd.DataFrame({'n_clusters': range(2, 10), 'silhouette_score': silhouette_scores})
# silhouette_df.to_csv('MLFPA_project-main/Raf_scores/silhouette_scores.csv', index=False)
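# Parallel variant (sketch): the multiprocessing.Pool import above suggests the
# sweep was meant to run across cores. Each worker pickles its own copy of the
# data, and the Pool must be created under a __main__ guard:
# def silhouette_for_k(n_clusters, data):
#     labels = KMeans(n_clusters=n_clusters, random_state=420).fit_predict(data)
#     return silhouette_score(data, labels)
# if __name__ == '__main__':
#     with Pool() as pool:
#         scores = pool.starmap(silhouette_for_k, [(k, embeddings_pca) for k in range(2, 10)])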
# Save the cluster labels for n_clusters = 5 (chosen from the silhouette sweep above)
def save_cluster_labels(n_clusters, data):
    kmeans = KMeans(n_clusters=n_clusters, random_state=420)
    labels = kmeans.fit_predict(data)
    labels_df = pd.DataFrame(labels, columns=['cluster_label'])
    labels_df.to_csv(f'raf_clusters/cluster_labels_{embed_name}.csv', index=False)
    return labels_df
labels_df = save_cluster_labels(5, embeddings_pca)
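# Sanity check (sketch): silhouette score for the chosen k; left commented like
# the sweep above because silhouette is quadratic in the number of points.
# print(silhouette_score(embeddings_pca, labels_df['cluster_label']))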