File size: 4,722 Bytes
7b3478d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import streamlit as st
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import umap
import pandas as pd
from word2vec import *
from sklearn.preprocessing import StandardScaler


def make_3d_plot(new_3d_vectors):
    """
    Turn DataFrame of 3D vectors into a labelled 3D matplotlib scatter plot.

    DataFrame structure: ['word', 'cosine_sim', '3d_vector']

    Parameters:
        new_3d_vectors: DataFrame whose '3d_vector' cells are indexable
            sequences with at least three components and whose 'word'
            column holds the text label drawn next to each point.

    Returns:
        The matplotlib Figure holding the 3D axes with the plotted points.
    """
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')

    plt.ion()

    # Unpack labels and coordinates positionally. Indexing the Series with
    # the enumerate() counter (x[i]) is a label lookup in pandas, which
    # breaks as soon as the DataFrame index is not 0..n-1 (e.g. after a
    # filter or sort).
    labels = new_3d_vectors['word'].tolist()
    vectors = new_3d_vectors['3d_vector'].tolist()
    x = [v[0] for v in vectors]
    y = [v[1] for v in vectors]
    z = [v[2] for v in vectors]

    # Plot points
    ax.scatter(x, y, z)

    # Add one text label per point
    for x_i, y_i, z_i, label in zip(x, y, z, labels):
        ax.text(x_i, y_i, z_i, label)

    # Set labels and title
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title('3D plot of word vectors')

    return fig


import plotly.express as px


def make_3d_plot2(df):
    """
    Build a plotly 3D scatter plot from a DataFrame of 3D vectors.

    DataFrame structure: ['word', 'cosine_sim', '3d_vector']

    Each entry of the '3d_vector' column supplies the x, y and z coordinate
    of one point; the matching 'word' entry is used as the point's text.
    Returns the plotly figure.
    """
    points = df['3d_vector'].tolist()

    # Split the per-row vectors into one coordinate list per axis
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    zs = [point[2] for point in points]

    return px.scatter_3d(df, x=xs, y=ys, z=zs, text=df['word'])


def make_3d_plot3(vectors_list, word, time_slice_model):
    """
    Turn list of 100D vectors into a 3D plot using UMAP and Plotly.

    List structure: [(word, model_name, vector, cosine_sim)]

    Parameters:
        vectors_list: entries whose first element is a word and whose
            fourth element is that word's cosine similarity.
        word: the query word; only used in the plot title.
        time_slice_model: model name, resolved to
            'models/{time_slice_model}.model'.

    Returns:
        (fig, df): the plotly figure and the DataFrame behind it, with
        columns ['x', 'y', 'z', 'word', 'cosine_sim'] restricted to the
        words present in vectors_list.
    """
    # Load model
    model = load_word2vec_model(f'models/{time_slice_model}.model')

    # Fit UMAP on every vector of the model and take the resulting 3D
    # embedding directly, rather than fitting and then re-projecting the
    # very same matrix.
    umap_model = umap.UMAP(n_components=3)
    transformed_vectors = umap_model.fit_transform(model.wv.vectors)

    # Create DataFrame from the transformed vectors
    df = pd.DataFrame(transformed_vectors, columns=['x', 'y', 'z'])
    df['word'] = model.wv.index_to_key

    # Look similarities up by word. The filtered rows keep the model's
    # vocabulary order, which is not the order of vectors_list, so a
    # positional assignment would pair words with the wrong similarity
    # (and raise if a word is missing from the model or listed twice).
    # The first occurrence of a word wins.
    sim_by_word = {}
    for entry in vectors_list:
        sim_by_word.setdefault(entry[0], entry[3])

    # Copy so the new column is written to an independent frame, not a
    # slice of the full-vocabulary DataFrame.
    df = df[df['word'].isin(list(sim_by_word))].copy()
    df['cosine_sim'] = df['word'].map(sim_by_word)

    # Create plot
    fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')

    return fig, df



def make_3d_plot4(vectors_list, word, time_slice_model):
    """
    Turn list of 100D vectors into a 3D plot using UMAP and Plotly.

    List structure: [(word, model_name, vector, cosine_sim)]

    All vectors of the model are standardised and embedded in 3D with
    UMAP; only the words present in vectors_list are kept for plotting.

    Parameters:
        vectors_list: entries whose first element is a word and whose
            fourth element is that word's cosine similarity.
        word: the query word; only used in the plot title.
        time_slice_model: model name, resolved to
            'models/{time_slice_model}.model'.

    Returns:
        (fig, df): the plotly figure and the DataFrame behind it, with
        columns ['word', '3d_vector', 'cosine_sim'] sorted by descending
        cosine similarity.
    """
    # Load model
    model = load_word2vec_model(f'models/{time_slice_model}.model')
    model_dict = model_dictionary(model)

    # Extract vectors and names from model_dict (same iteration order)
    all_vector_names = list(model_dict.keys())
    all_vectors = list(model_dict.values())

    # Scale the vectors
    scaler = StandardScaler()
    vectors_scaled = scaler.fit_transform(all_vectors)

    # Make UMAP model and fit it to the scaled vectors
    umap_model = umap.UMAP(n_components=3)
    umap_result = umap_model.fit_transform(vectors_scaled)

    # Word -> cosine similarity, keeping the first occurrence of a word.
    # A dict gives constant-time lookups instead of rescanning vectors_list
    # for every word in the vocabulary.
    sim_by_word = {}
    for entry in vectors_list:
        sim_by_word.setdefault(entry[0], entry[3])

    # Pair each name with its 3D representation and keep only the words
    # that appear in vectors_list, together with their cosine similarity.
    result_with_names = [
        (name, coords, sim_by_word[name])
        for name, coords in zip(all_vector_names, umap_result)
        if name in sim_by_word
    ]

    # Create DataFrame from the transformed vectors
    df = pd.DataFrame(result_with_names, columns=['word', '3d_vector', 'cosine_sim'])

    # Sort dataframe by cosine_sim
    df = df.sort_values(by='cosine_sim', ascending=False)

    # One coordinate Series per axis, taken after sorting so they line up
    # with the rows of df
    x = df['3d_vector'].apply(lambda v: v[0])
    y = df['3d_vector'].apply(lambda v: v[1])
    z = df['3d_vector'].apply(lambda v: v[2])

    # Create plot
    fig = px.scatter_3d(df, x=x, y=y, z=z, text='word', color='cosine_sim', color_continuous_scale='Reds')
    fig.update_traces(marker=dict(size=5))
    fig.update_layout(title=f'3D plot of nearest neighbours to {word}')

    return fig, df