|
import streamlit as st |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
from mpl_toolkits.mplot3d import Axes3D |
|
import umap |
|
import pandas as pd |
|
from word2vec import * |
|
from sklearn.preprocessing import StandardScaler |
|
|
|
|
|
def make_3d_plot(new_3d_vectors):
    """
    Turn a DataFrame of 3D vectors into a Matplotlib 3D scatter plot.

    DataFrame structure: ['word', 'cosine_sim', '3d_vector']

    Every row is drawn at the first three components of its '3d_vector'
    entry and annotated with the text in its 'word' column.

    Returns the Matplotlib figure holding the 3D axes.
    """
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

    # Kept from the original implementation: switches pyplot to interactive
    # mode (a process-wide setting).
    plt.ion()

    labels = new_3d_vectors['word']
    x = new_3d_vectors['3d_vector'].apply(lambda v: v[0])
    y = new_3d_vectors['3d_vector'].apply(lambda v: v[1])
    z = new_3d_vectors['3d_vector'].apply(lambda v: v[2])

    ax.scatter(x, y, z)

    # Pair each label with its coordinates positionally.  Indexing the
    # Series with an enumerate() counter (x[i]) is a label-based lookup, so
    # on a filtered or sorted DataFrame it either raises KeyError or attaches
    # a label to another row's point.
    for label, x_pos, y_pos, z_pos in zip(labels, x, y, z):
        ax.text(x_pos, y_pos, z_pos, label)

    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title('3D plot of word vectors')

    return fig
|
|
|
|
|
import plotly.express as px |
|
|
|
|
|
def make_3d_plot2(df):
    """
    Build a Plotly 3D scatter figure from a DataFrame of 3D vectors.

    DataFrame structure: ['word', 'cosine_sim', '3d_vector']

    The first three components of each '3d_vector' entry become the x, y
    and z coordinates; the 'word' column supplies the text for each point.

    Returns the Plotly figure.
    """
    points = df['3d_vector'].tolist()

    # One coordinate list per axis, produced in x, y, z order.
    xs, ys, zs = ([point[axis] for point in points] for axis in range(3))

    return px.scatter_3d(df, x=xs, y=ys, z=zs, text=df['word'])
|
|
|
|
|
def make_3d_plot3(vectors_list, word, time_slice_model):
    """
    Turn list of 100D vectors into a 3D plot using UMAP and Plotly.

    List structure: [(word, model_name, vector, cosine_sim)]

    The model stored at 'models/<time_slice_model>.model' is loaded, all of
    its vectors (model.wv.vectors) are reduced to three UMAP components, and
    only the rows whose word appears in vectors_list are plotted, coloured
    by their cosine similarity.

    Parameters:
        vectors_list: neighbour entries; only element 0 (the word) and
            element 3 (the cosine similarity) are read here.
        word: the query word, used only in the plot title.
        time_slice_model: model name interpolated into the model file path.

    Returns:
        (fig, df): the Plotly figure and the DataFrame it was drawn from,
        with columns ['x', 'y', 'z', 'word', 'cosine_sim'].  Rows keep the
        order of model.wv.index_to_key.  Words in vectors_list that are not
        in the model are left out.
    """
    model = load_word2vec_model(f'models/{time_slice_model}.model')

    # Fit and embed in a single step; the embedding is computed on exactly
    # the data it was fitted on.
    umap_model = umap.UMAP(n_components=3)
    transformed_vectors = umap_model.fit_transform(model.wv.vectors)

    df = pd.DataFrame(transformed_vectors, columns=['x', 'y', 'z'])
    df['word'] = model.wv.index_to_key

    # Look similarities up by word.  Assigning the raw list after filtering
    # would pair similarities (in vectors_list order) with rows that are in
    # vocabulary order, and would fail outright on a length mismatch.
    # If a word is listed more than once, its first entry wins.
    sim_by_word = {}
    for entry in vectors_list:
        sim_by_word.setdefault(entry[0], entry[3])

    # .copy() so the new column is written to an independent frame rather
    # than to a slice of the full-vocabulary frame.
    df = df[df['word'].isin(list(sim_by_word))].copy()
    df['cosine_sim'] = df['word'].map(sim_by_word)

    fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')

    return fig, df
|
|
|
|
|
|
|
def make_3d_plot4(vectors_list, word, time_slice_model):
    """
    Turn list of 100D vectors into a 3D plot using UMAP and Plotly.

    List structure: [(word, model_name, vector, cosine_sim)]

    The model stored at 'models/<time_slice_model>.model' is loaded, the
    values of model_dictionary(model) are standardised with StandardScaler
    and reduced to three UMAP components, and only the entries whose key
    appears in vectors_list are plotted, coloured by cosine similarity.

    Parameters:
        vectors_list: neighbour entries; only element 0 (the word) and
            element 3 (the cosine similarity) are read here.
        word: the query word, used only in the plot title.
        time_slice_model: model name interpolated into the model file path.

    Returns:
        (fig, df): the Plotly figure and the DataFrame it was drawn from,
        with columns ['word', '3d_vector', 'cosine_sim'], sorted by
        'cosine_sim' in descending order.
    """
    model = load_word2vec_model(f'models/{time_slice_model}.model')

    # NOTE(review): model_dictionary is defined elsewhere; presumably it maps
    # each vocabulary word to its vector -- confirm against the word2vec module.
    model_dict = model_dictionary(model)

    all_vector_names = list(model_dict.keys())
    all_vectors = list(model_dict.values())

    # Standardise every feature before the dimensionality reduction.
    scaler = StandardScaler()
    vectors_scaled = scaler.fit_transform(all_vectors)

    umap_model = umap.UMAP(n_components=3)
    umap_result = umap_model.fit_transform(vectors_scaled)

    # Build the word -> similarity lookup once instead of rebuilding the
    # neighbour word list for every vocabulary entry.  If a word is listed
    # more than once, its first entry wins.
    sim_by_word = {}
    for entry in vectors_list:
        sim_by_word.setdefault(entry[0], entry[3])

    # Keep only the requested words, in the order of the model dictionary,
    # together with their reduced coordinates and similarity.
    result_with_names = [
        (name, coords, sim_by_word[name])
        for name, coords in zip(all_vector_names, umap_result)
        if name in sim_by_word
    ]

    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])
    df = df.sort_values(by='cosine_sim', ascending=False)

    x = df['3d_vector'].apply(lambda v: v[0])
    y = df['3d_vector'].apply(lambda v: v[1])
    z = df['3d_vector'].apply(lambda v: v[2])

    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')

    return fig, df
|
|