Spaces:
Sleeping
Sleeping
File size: 2,983 Bytes
bed7d75 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
#from google.colab import userdata
import kuzu
import logging
import sys
import os
from llama_index.graph_stores.kuzu import KuzuGraphStore
from llama_index.core import (
SimpleDirectoryReader,
ServiceContext,
KnowledgeGraphIndex,
)
from llama_index.readers.web import SimpleWebPageReader
from llama_index.llms.openai import OpenAI
from IPython.display import Markdown, display
from llama_index.core.storage.storage_context import StorageContext
from pyvis.network import Network
import pandas as pd
import numpy as np
import plotly.express as px
import umap
def get_index(links, db_path="kg1"):
    """Build a Kuzu-backed knowledge-graph index from a list of web pages.

    Args:
        links: URLs passed to ``SimpleWebPageReader.load_data``.
        db_path: Path of the Kuzu database. Defaults to ``"kg1"``, the
            value that used to be hard-coded.

    Returns:
        The ``KnowledgeGraphIndex`` built from the downloaded documents
        (at most 5 triplets per 512-token chunk, embeddings included).

    Raises:
        RuntimeError: If no OpenAI API key can be found in the environment.
    """
    # The Colab ``userdata`` import is commented out at the top of the file,
    # so calling ``userdata.get('oai')`` raised NameError. Resolve the key
    # from the environment instead; ``oai`` is accepted as a fallback because
    # that is the secret name the original lookup used.
    api_key = os.environ.get("OPENAI_API_KEY") or os.environ.get("oai")
    if not api_key:
        raise RuntimeError(
            "No OpenAI API key found: set the OPENAI_API_KEY environment "
            "variable (or 'oai') before calling get_index()."
        )
    # Export under the standard name, as the original assignment did.
    os.environ["OPENAI_API_KEY"] = api_key

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    db = kuzu.Database(db_path)
    graph_store = KuzuGraphStore(db)
    documents = SimpleWebPageReader(html_to_text=True).load_data(links)

    # Pass the resolved key explicitly; the original passed an empty string.
    llm = OpenAI(temperature=0, model="gpt-3.5-turbo", api_key=api_key)
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # NOTE: can take a while!
    index = KnowledgeGraphIndex.from_documents(
        documents=documents,
        max_triplets_per_chunk=5,
        storage_context=storage_context,
        service_context=service_context,
        show_progress=True,
        include_embeddings=True,
    )
    return index
def get_network_graph(index, filename="kuzugraph_draw3.html"):
    """Render the index's knowledge graph to an interactive pyvis HTML file.

    Args:
        index: Object exposing ``get_networkx_graph()`` (e.g. the index
            returned by ``get_index``).
        filename: Destination HTML file. Defaults to
            ``"kuzugraph_draw3.html"``, the name that used to be hard-coded.

    Returns:
        None. The graph is written to ``filename``.
    """
    nx_graph = index.get_networkx_graph()
    net = Network(notebook=True, cdn_resources="in_line", directed=True)
    net.from_nx(nx_graph)
    # NOTE(review): show() and save_graph() target the same file, so the
    # second call looks redundant. Both are kept to preserve the original
    # behavior; confirm against pyvis before removing either.
    net.show(filename)
    net.save_graph(filename)
def get_embeddings(index):
    """Return the index's stored embeddings as a pandas Series.

    The index structure is dumped to a dict, loaded into a DataFrame, and
    the ``embedding_dict`` column is returned with missing rows dropped.

    Args:
        index: Object whose ``index_struct.to_dict()`` yields a mapping that
            contains an ``embedding_dict`` entry.

    Returns:
        The ``embedding_dict`` column as a Series, without NaN entries.
    """
    struct_frame = pd.DataFrame.from_dict(index.index_struct.to_dict())
    return struct_frame['embedding_dict'].dropna()
def get_visualize_embeddings(embedding_series, n_neighbors=15, min_dist=0.1, n_components=2):
    """Project embeddings with UMAP and return a Plotly scatter figure.

    Args:
        embedding_series: pandas Series whose values are equal-length
            embedding vectors; its index is shown as the hover label.
        n_neighbors: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        n_components: Number of UMAP output dimensions. 2 gives a 2-D
            scatter plot, 3 gives a 3-D scatter plot.

    Returns:
        The Plotly Express figure.

    Raises:
        ValueError: If ``n_components`` is not 2 or 3, or if
            ``embedding_series`` is empty.
    """
    # Validate before the UMAP fit. Previously any value other than 2 only
    # failed afterwards, when two hard-coded column names were applied.
    if n_components not in (2, 3):
        raise ValueError(
            f"n_components must be 2 or 3 to be plotted, got {n_components!r}"
        )
    # Use len() because the truth value of a Series is ambiguous.
    if len(embedding_series) == 0:
        raise ValueError("embedding_series is empty; nothing to visualize")

    # Convert Series to DataFrame. ``.iloc[0]`` takes the first row by
    # position; ``series[0]`` on a label-indexed Series relies on a
    # deprecated positional fallback.
    vector_size = len(embedding_series.iloc[0])
    embedding_df = pd.DataFrame(
        embedding_series.tolist(),
        columns=[f'dim_{i+1}' for i in range(vector_size)],
    )

    # Perform UMAP dimensionality reduction (fixed seed).
    umap_embedded = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=42,
    ).fit_transform(embedding_df.values)

    # One named column per requested UMAP dimension.
    axis_names = [f'UMAP Dimension {i}' for i in range(1, n_components + 1)]
    umap_df = pd.DataFrame(umap_embedded, columns=axis_names)
    umap_df['Label'] = embedding_series.index

    # Plot the UMAP embedding using Plotly Express.
    title = 'UMAP Visualization of Embeddings'
    if n_components == 3:
        return px.scatter_3d(
            umap_df,
            x=axis_names[0],
            y=axis_names[1],
            z=axis_names[2],
            hover_data=['Label'],
            title=title,
        )
    return px.scatter(
        umap_df,
        x=axis_names[0],
        y=axis_names[1],
        hover_data=['Label'],
        title=title,
    )
|