File size: 2,941 Bytes
bed7d75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa95f1b
 
bed7d75
 
 
 
 
 
 
 
 
 
aa95f1b
bed7d75
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#from google.colab import userdata
import kuzu
import logging
import sys
import os
from llama_index.graph_stores.kuzu import KuzuGraphStore
from llama_index.core import (
    SimpleDirectoryReader,
    ServiceContext,
    KnowledgeGraphIndex,
)
from llama_index.readers.web import SimpleWebPageReader


from llama_index.llms.openai import OpenAI
from IPython.display import Markdown, display
from llama_index.core.storage.storage_context import StorageContext

from pyvis.network import Network
import pandas as pd
import numpy as np
import plotly.express as px
import umap

def get_index(links, token, *, db_path="kg1", model="gpt-3.5-turbo",
              chunk_size=512, max_triplets_per_chunk=5):
    """Build a knowledge-graph index from web pages, stored in a Kuzu database.

    The pages at ``links`` are downloaded and converted to text, an OpenAI
    chat model extracts triplets from each chunk, and the resulting graph is
    written to a Kuzu graph store.

    Args:
        links: URL or collection of URLs to ingest. A single string is
            wrapped into a one-element list; any other iterable is copied
            into a list before being handed to the reader.
        token: OpenAI API key, passed directly to the LLM client.
        db_path: Location of the Kuzu database (default ``"kg1"``).
        model: OpenAI model name used for triplet extraction.
        chunk_size: Chunk size given to the service context.
        max_triplets_per_chunk: Upper bound on triplets extracted per chunk.

    Returns:
        The ``KnowledgeGraphIndex`` built with ``include_embeddings=True``.
    """
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    # The reader is handed a real list; a bare string would otherwise be
    # treated as something other than "one URL".
    urls = [links] if isinstance(links, str) else list(links)

    db = kuzu.Database(db_path)
    graph_store = KuzuGraphStore(db)

    documents = SimpleWebPageReader(html_to_text=True).load_data(urls)

    # temperature=0 keeps triplet extraction as repeatable as the model allows.
    llm = OpenAI(temperature=0, model=model, api_key=token)
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=chunk_size)

    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # NOTE: can take a while!
    index = KnowledgeGraphIndex.from_documents(
        documents=documents,
        max_triplets_per_chunk=max_triplets_per_chunk,
        storage_context=storage_context,
        service_context=service_context,
        show_progress=True,
        include_embeddings=True,
    )

    return index

def get_network_graph(index, output_path="kuzugraph_draw3.html"):
    """Render the index's graph as an interactive pyvis HTML page.

    Args:
        index: Object exposing ``get_networkx_graph()``; its result is loaded
            into a directed pyvis ``Network``.
        output_path: HTML file to write. Defaults to the previously
            hard-coded ``"kuzugraph_draw3.html"``.

    Returns:
        None. The function is used for its side effect of writing the file.
    """
    nx_graph = index.get_networkx_graph()

    # cdn_resources="in_line" asks pyvis to inline its resources in the page.
    net = Network(notebook=True, cdn_resources="in_line", directed=True)
    net.from_nx(nx_graph)

    # NOTE(review): show() and save_graph() both target the same file; the
    # second call looks redundant, but both are kept to preserve the
    # original output exactly. Confirm before removing either.
    net.show(output_path)
    net.save_graph(output_path)


def get_embeddings(index):
    """Return the index's stored embeddings as a pandas Series.

    The index structure is serialised with ``to_dict()`` and loaded into a
    DataFrame; the ``'embedding_dict'`` column is then selected and rows
    without an embedding are dropped.

    Args:
        index: Object whose ``index_struct.to_dict()`` yields a mapping that
            contains an ``'embedding_dict'`` entry.

    Returns:
        Series named ``'embedding_dict'`` with the missing entries removed.
    """
    struct_frame = pd.DataFrame.from_dict(index.index_struct.to_dict())
    return struct_frame['embedding_dict'].dropna()


def get_visualize_embeddings(embedding_series, n_neighbors=15, min_dist=0.1, n_components=2):
    """Project embeddings with UMAP and return a Plotly scatter figure.

    Args:
        embedding_series: pandas Series whose values are embedding vectors
            (sequences of numbers) and whose index supplies the hover labels.
        n_neighbors: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        n_components: Number of UMAP output dimensions; must be at least 2.

    Returns:
        A Plotly Express figure: a 3-D scatter when ``n_components == 3``,
        otherwise a 2-D scatter of the first two UMAP dimensions.

    Raises:
        ValueError: If ``embedding_series`` is empty or ``n_components`` is
            smaller than 2.
    """
    # Fail early with a clear message instead of an indexing error (empty
    # input) or a column/shape mismatch (too few components) further down.
    if len(embedding_series) == 0:
        raise ValueError("embedding_series is empty; nothing to visualize")
    if n_components < 2:
        raise ValueError("n_components must be at least 2 to draw a scatter plot")

    # One row per embedding. Built straight from the values so no positional
    # lookup such as ``series[0]`` is needed on a label-indexed Series.
    vectors = pd.DataFrame(embedding_series.tolist()).to_numpy()

    # Perform UMAP dimensionality reduction (fixed seed for repeatable layouts)
    umap_embedded = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=42,
    ).fit_transform(vectors)

    # Column names follow n_components rather than assuming exactly two.
    axis_names = [f'UMAP Dimension {i}' for i in range(1, n_components + 1)]
    umap_df = pd.DataFrame(umap_embedded, columns=axis_names)
    umap_df['Label'] = embedding_series.index

    # Plot the UMAP embedding using Plotly Express
    title = 'UMAP Visualization of Embeddings'
    if n_components == 3:
        return px.scatter_3d(
            umap_df,
            x=axis_names[0],
            y=axis_names[1],
            z=axis_names[2],
            hover_data=['Label'],
            title=title,
        )
    return px.scatter(
        umap_df,
        x=axis_names[0],
        y=axis_names[1],
        hover_data=['Label'],
        title=title,
    )