Spaces:
Sleeping
Sleeping
#from google.colab import userdata | |
import kuzu | |
import logging | |
import sys | |
import os | |
from llama_index.graph_stores.kuzu import KuzuGraphStore | |
from llama_index.core import ( | |
SimpleDirectoryReader, | |
ServiceContext, | |
KnowledgeGraphIndex, | |
) | |
from llama_index.readers.web import SimpleWebPageReader | |
from llama_index.llms.openai import OpenAI | |
from IPython.display import Markdown, display | |
from llama_index.core.storage.storage_context import StorageContext | |
from pyvis.network import Network | |
import pandas as pd | |
import numpy as np | |
import plotly.express as px | |
import umap | |
def get_index(links, db_path="kg1"):
    """Build a knowledge-graph index over web pages, stored in a Kuzu database.

    The pages at ``links`` are loaded with ``SimpleWebPageReader`` (HTML
    converted to text) and handed to ``KnowledgeGraphIndex.from_documents``
    with an OpenAI ``gpt-3.5-turbo`` LLM, ``chunk_size=512``, at most five
    triplets per chunk, and embeddings included.

    Args:
        links: List of URLs to load.
        db_path: Path of the Kuzu database to open. Defaults to ``"kg1"``.

    Returns:
        The constructed ``KnowledgeGraphIndex``.

    Raises:
        RuntimeError: If no OpenAI API key can be found, neither in the Colab
            secret ``'oai'`` nor in the ``OPENAI_API_KEY`` environment variable.
    """
    # ``google.colab`` only exists inside Colab, so import it lazily and fall
    # back to the environment when running anywhere else.
    try:
        from google.colab import userdata
    except ImportError:
        userdata = None

    if userdata is not None:
        os.environ["OPENAI_API_KEY"] = userdata.get('oai')

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No OpenAI API key available: set the OPENAI_API_KEY environment "
            "variable or define the Colab secret 'oai'."
        )

    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    db = kuzu.Database(db_path)
    graph_store = KuzuGraphStore(db)

    documents = SimpleWebPageReader(html_to_text=True).load_data(links)

    # Pass the resolved key explicitly; an empty string here would be used
    # as-is instead of the key stored in the environment.
    llm = OpenAI(temperature=0, model="gpt-3.5-turbo", api_key=api_key)
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # NOTE: can take a while!
    index = KnowledgeGraphIndex.from_documents(
        documents=documents,
        max_triplets_per_chunk=5,
        storage_context=storage_context,
        service_context=service_context,
        show_progress=True,
        include_embeddings=True,
    )
    return index
def get_network_graph(index, output_path="kuzugraph_draw3.html"):
    """Render the index's graph to an interactive pyvis HTML file.

    The graph returned by ``index.get_networkx_graph()`` is loaded into a
    directed pyvis ``Network`` created with ``notebook=True`` and
    ``cdn_resources="in_line"``, then shown and saved to ``output_path``.

    Args:
        index: Object exposing ``get_networkx_graph()``.
        output_path: HTML file to write. Defaults to
            ``"kuzugraph_draw3.html"``.

    Returns:
        The ``output_path`` the graph was written to.
    """
    nx_graph = index.get_networkx_graph()

    net = Network(notebook=True, cdn_resources="in_line", directed=True)
    net.from_nx(nx_graph)

    # Both calls are kept from the original implementation and target the
    # same file.
    net.show(output_path)
    net.save_graph(output_path)
    return output_path
def get_embeddings(index):
    """Return the index's ``embedding_dict`` entries as a pandas Series.

    The index structure is dumped with ``index.index_struct.to_dict()`` and
    loaded into a DataFrame; the ``'embedding_dict'`` column is then selected
    and rows without a value in that column are dropped.

    Args:
        index: Object whose ``index_struct.to_dict()`` returns a mapping
            containing an ``'embedding_dict'`` entry.

    Returns:
        A pandas Series named ``'embedding_dict'`` holding the non-missing
        entries.
    """
    struct_frame = pd.DataFrame.from_dict(index.index_struct.to_dict())
    # Labels that appear only in the other columns are NaN here; discard them.
    return struct_frame['embedding_dict'].dropna()
def get_visualize_embeddings(embedding_series, n_neighbors=15, min_dist=0.1, n_components=2):
    """Reduce embeddings with UMAP and return a Plotly scatter figure.

    Args:
        embedding_series: pandas Series whose values are embedding vectors
            (sequences of numbers) and whose index supplies the hover label
            for each point.
        n_neighbors: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        n_components: Number of UMAP output dimensions; must be at least 2.

    Returns:
        A Plotly Express figure: a 3-D scatter when ``n_components == 3``,
        otherwise a 2-D scatter of the first two UMAP dimensions.

    Raises:
        ValueError: If ``embedding_series`` is empty or ``n_components`` is
            smaller than 2.
    """
    if len(embedding_series) == 0:
        raise ValueError("embedding_series is empty; nothing to visualize")
    if n_components < 2:
        raise ValueError("n_components must be at least 2 to draw a scatter plot")

    # One row per embedding. Building the frame straight from the list avoids
    # ``embedding_series[0]``, which is a label lookup on a non-integer index.
    embedding_matrix = pd.DataFrame(embedding_series.tolist()).values

    # Fixed seed so repeated calls on the same data give the same layout.
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=42,
    )
    umap_embedded = reducer.fit_transform(embedding_matrix)

    # Column names must follow n_components or the DataFrame shape won't match.
    axis_names = [f'UMAP Dimension {i}' for i in range(1, n_components + 1)]
    umap_df = pd.DataFrame(umap_embedded, columns=axis_names)
    umap_df['Label'] = embedding_series.index

    title = 'UMAP Visualization of Embeddings'
    if n_components == 3:
        return px.scatter_3d(
            umap_df,
            x=axis_names[0],
            y=axis_names[1],
            z=axis_names[2],
            hover_data=['Label'],
            title=title,
        )
    return px.scatter(
        umap_df,
        x=axis_names[0],
        y=axis_names[1],
        hover_data=['Label'],
        title=title,
    )