# Spaces: Sleeping
# File size: 5,118 Bytes
import kuzu
import logging
import sys
import os
#import llama_index
from llama_index.graph_stores import KuzuGraphStore
from llama_index import (
SimpleDirectoryReader,
ServiceContext,
KnowledgeGraphIndex,
)
from llama_index.readers import SimpleWebPageReader
from llama_index.indices.loading import load_index_from_storage
from llama_index.llms import OpenAI
from IPython.display import Markdown, display
from llama_index.storage.storage_context import StorageContext
from pyvis.network import Network
import pandas as pd
import numpy as np
import plotly.express as px
import umap
def make_dir():
    """Create the ``data`` directory in the current working directory.

    Calling this when the directory already exists is a no-op.

    Raises:
        FileExistsError: If ``data`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory between check and mkdir.
    os.makedirs("data", exist_ok=True)
def save_uploadedfile(uploadedfile):
    """Write an uploaded file's bytes into the ``data`` directory.

    Args:
        uploadedfile: Object exposing a ``name`` attribute and a
            ``getbuffer()`` method returning the bytes to write
            (presumably a Streamlit upload object -- confirm with caller).

    Raises:
        ValueError: If ``uploadedfile.name`` has no usable file-name component.
    """
    # Keep only the final path component so a client-supplied name such as
    # "../x" cannot be written outside the data directory.
    filename = os.path.basename(uploadedfile.name)
    if filename in {"", ".", ".."}:
        raise ValueError(f"invalid upload file name: {uploadedfile.name!r}")

    # Do not rely on the caller having created the directory beforehand.
    os.makedirs("data", exist_ok=True)
    with open(os.path.join("data", filename), "wb") as f:
        f.write(uploadedfile.getbuffer())
def load_index(token, name):
    """Reload a previously persisted knowledge-graph index from disk.

    Args:
        token: OpenAI API key. It is exported as ``OPENAI_API_KEY`` and also
            passed directly to the LLM client.
        name: Directory containing the Kuzu database (``<name>/kg``) and the
            persisted index storage (``<name>/storage``).

    Returns:
        The index returned by ``load_index_from_storage``.
    """
    os.environ["OPENAI_API_KEY"] = token
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    kg_path = name + "/kg"
    storage_path = name + "/storage"

    kuzu_store = KuzuGraphStore(kuzu.Database(kg_path))
    chat_model = OpenAI(temperature=0, model="gpt-3.5-turbo", api_key=token)
    svc_ctx = ServiceContext.from_defaults(llm=chat_model, chunk_size=512)
    store_ctx = StorageContext.from_defaults(
        graph_store=kuzu_store,
        persist_dir=storage_path,
    )
    return load_index_from_storage(
        storage_context=store_ctx,
        service_context=svc_ctx,
    )
def get_index_pdf(token, name):
    """Build and persist a knowledge-graph index from the files in ``./data``.

    Args:
        token: OpenAI API key. It is exported as ``OPENAI_API_KEY`` and also
            passed directly to the LLM client.
        name: Directory to create for this index. The Kuzu database is placed
            in ``<name>/kg`` and the index storage in ``<name>/storage``.

    Returns:
        The newly built ``KnowledgeGraphIndex``.

    Raises:
        FileExistsError: If ``name`` already exists.
    """
    # Fail fast, before the potentially slow document load.
    if os.path.exists(name):
        raise FileExistsError(f"index directory already exists: {name!r}")

    # Load before creating anything on disk so a failed read does not leave
    # an empty directory behind that would block the next attempt.
    documents = SimpleDirectoryReader("./data").load_data()
    print(documents)

    os.mkdir(name)
    os.environ["OPENAI_API_KEY"] = token
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    db = kuzu.Database(name + "/kg")
    graph_store = KuzuGraphStore(db)
    llm = OpenAI(temperature=0, model="gpt-3.5-turbo", api_key=token)
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # NOTE: can take a while -- triplets are extracted per chunk.
    index = KnowledgeGraphIndex.from_documents(
        documents=documents,
        max_triplets_per_chunk=2,
        storage_context=storage_context,
        service_context=service_context,
        show_progress=True,
        include_embeddings=True,
    )
    index.storage_context.persist(name + "/storage")
    return index
def get_index(links, token, name):
    """Build and persist a knowledge-graph index from web pages.

    Args:
        links: URLs handed to ``SimpleWebPageReader.load_data``.
        token: OpenAI API key. It is exported as ``OPENAI_API_KEY`` and also
            passed directly to the LLM client.
        name: Directory to create for this index. The Kuzu database is placed
            in ``<name>/kg`` and the index storage in ``<name>/storage``.

    Returns:
        The newly built ``KnowledgeGraphIndex``.

    Raises:
        FileExistsError: If ``name`` already exists.
    """
    # Fail fast, before any network traffic.
    if os.path.exists(name):
        raise FileExistsError(f"index directory already exists: {name!r}")

    # Fetch the pages before creating anything on disk: if a download fails,
    # no stale directory is left behind to make every retry hit
    # FileExistsError.
    documents = SimpleWebPageReader(html_to_text=True).load_data(links)

    os.mkdir(name)
    os.environ["OPENAI_API_KEY"] = token
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)

    db = kuzu.Database(name + "/kg")
    graph_store = KuzuGraphStore(db)
    llm = OpenAI(temperature=0, model="gpt-3.5-turbo", api_key=token)
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=512)
    storage_context = StorageContext.from_defaults(graph_store=graph_store)

    # NOTE: can take a while!
    index = KnowledgeGraphIndex.from_documents(
        documents=documents,
        max_triplets_per_chunk=2,
        storage_context=storage_context,
        service_context=service_context,
        show_progress=True,
        include_embeddings=True,
    )
    index.storage_context.persist(name + "/storage")
    return index
def get_network_graph(index, output_path="kuzugraph_draw3.html"):
    """Write the index's graph to an HTML file as a directed pyvis network.

    Args:
        index: Object exposing ``get_networkx_graph()``.
        output_path: Destination HTML file. Defaults to the file name this
            function has always written, so existing callers are unaffected.
    """
    nx_graph = index.get_networkx_graph()
    net = Network(directed=True)
    net.from_nx(nx_graph)
    # save_graph only writes the file; it does not try to open a browser.
    net.save_graph(output_path)
def get_embeddings(index):
    """Return the index's stored embeddings as a pandas Series.

    Args:
        index: Object whose ``index_struct.to_dict()`` returns a mapping that
            contains an ``"embedding_dict"`` entry (label -> vector).

    Returns:
        A Series named ``"embedding_dict"``, indexed by label, with one
        embedding vector per element. Entries with a missing value are
        dropped; the Series is empty when no embeddings are stored.
    """
    struct = index.index_struct.to_dict()
    # Read the one entry that is needed instead of building a DataFrame from
    # the whole struct, which also pulled every unrelated key into the row
    # index only to discard those rows again via dropna().
    embedding_map = struct.get("embedding_dict") or {}
    embeddings = pd.Series(embedding_map, name="embedding_dict", dtype=object)
    return embeddings.dropna()
def get_visualize_embeddings(embedding_series, n_neighbors=15, min_dist=0.1, n_components=2):
    """Reduce embeddings with UMAP and return a Plotly scatter figure.

    Args:
        embedding_series: pandas Series whose values are equal-length
            embedding vectors; its index is shown as the hover label.
        n_neighbors: Forwarded to ``umap.UMAP``.
        min_dist: Forwarded to ``umap.UMAP``.
        n_components: Number of UMAP output dimensions (at least 2).

    Returns:
        A 2-D scatter figure when ``n_components`` is 2; a 3-D scatter of the
        first three dimensions when ``n_components`` is 3 or more.

    Raises:
        ValueError: If ``embedding_series`` is empty or ``n_components`` is
            below 2.
    """
    if len(embedding_series) == 0:
        raise ValueError("embedding_series is empty; nothing to visualize")
    if n_components < 2:
        raise ValueError("n_components must be at least 2 to draw a scatter plot")

    # Stack the vectors directly. The previous embedding_series[0] lookup
    # relied on positional fallback for a label-indexed Series, which pandas
    # has deprecated.
    vectors = np.asarray(embedding_series.tolist(), dtype=float)

    # Fixed random_state keeps the layout reproducible between runs.
    projected = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        random_state=42,
    ).fit_transform(vectors)

    # One column per requested component; two hard-coded names made every
    # n_components other than 2 fail in the DataFrame constructor.
    axis_names = [f"UMAP Dimension {i + 1}" for i in range(n_components)]
    umap_df = pd.DataFrame(projected, columns=axis_names)
    umap_df["Label"] = embedding_series.index

    title = "UMAP Visualization of Embeddings"
    if n_components >= 3:
        return px.scatter_3d(
            umap_df,
            x=axis_names[0],
            y=axis_names[1],
            z=axis_names[2],
            hover_data=["Label"],
            title=title,
        )
    return px.scatter(
        umap_df,
        x=axis_names[0],
        y=axis_names[1],
        hover_data=["Label"],
        title=title,
    )
def query_model(index, user_query):
    """Answer ``user_query`` using a query engine built from ``index``.

    The engine is created with text inclusion enabled, the
    ``tree_summarize`` response mode, ``hybrid`` embedding mode and a
    similarity top-k of 5.

    Args:
        index: Object exposing ``as_query_engine(**kwargs)``.
        user_query: The question passed to the engine's ``query`` method.

    Returns:
        Whatever the query engine's ``query`` call returns.
    """
    engine_options = {
        "include_text": True,
        "response_mode": "tree_summarize",
        "embedding_mode": "hybrid",
        "similarity_top_k": 5,
    }
    return index.as_query_engine(**engine_options).query(user_query)