# SexBot / prepare_vectorstore.py
# Source: Hugging Face repo "Pew404/SexBot", uploaded via huggingface_hub
# (commit 13fbd2e, verified)
# Collect all file names under a given path
import os
# Root data directory; documents live in per-type subfolders (e.g. <data_path>/pdf)
data_path = '/data1/home/purui/projects/chatbot/data'
def get_all_files(path, file_type):
    """Recursively collect the paths of all files under ``path/<file_type>``.

    Args:
        path: Root data directory.
        file_type: Name of the subdirectory to walk (e.g. ``'pdf'``).

    Returns:
        list[str]: Path of every regular file found; empty list if the
        subdirectory does not exist (``os.walk`` simply yields nothing).
    """
    # os.path.join instead of string concatenation: portable and robust
    # to a trailing separator in `path`.
    sub_path = os.path.join(path, file_type)
    file_paths = []
    for root, _dirs, files in os.walk(sub_path):
        file_paths.extend(os.path.join(root, name) for name in files)
    return file_paths
from llama_index.core import SimpleDirectoryReader
def load(path, file_type):
    """Load every ``file_type`` document under ``path`` with LlamaIndex.

    Each loaded document gets a ``file_name`` metadata entry holding the
    file's base name without its extension.

    Args:
        path: Root data directory.
        file_type: Subdirectory / document type to load (e.g. ``'pdf'``).

    Returns:
        list: Loaded Document objects, one per file (only the first page /
        part returned by ``SimpleDirectoryReader`` is kept per file).
    """
    docs = []
    for file in get_all_files(path, file_type):
        # basename + splitext is robust to Windows separators and to dotted
        # names: split('/')[-1].split('.')[0] would truncate "a.b.pdf" to "a".
        file_name = os.path.splitext(os.path.basename(file))[0]
        doc = SimpleDirectoryReader(input_files=[file]).load_data()[0]
        doc.metadata.update({"file_name": file_name})
        docs.append(doc)
    return docs
# Load all PDFs; despite the singular name, `doc` is a list of Documents,
# one per file.
doc = load(data_path, 'pdf')
print(f"Total file: {len(doc)}")
from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
from llama_index.llms.ollama import Ollama
from langchain_community.embeddings.ollama import OllamaEmbeddings
from llama_index.core import VectorStoreIndex
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core import Settings
# Sentence-window parsing: each node keeps a 3-sentence surrounding window
# in metadata["window"] and the matched sentence in
# metadata["original_sentence"].
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_sentence",
)
# Plain sentence splitter used to build a baseline index for comparison.
text_splitter = SentenceSplitter()
# Local Ollama llama3 for both generation and embeddings.
# NOTE(review): OllamaEmbeddings here is the LangChain class, while
# Settings.embed_model normally expects a llama_index embedding — confirm
# this interop actually works in the installed versions.
llm = Ollama(model="llama3")
embed_model = OllamaEmbeddings(model="llama3")
Settings.llm = llm
Settings.embed_model = embed_model
Settings.text_splitter = text_splitter
# Only the first document (doc[0]) is parsed and indexed — presumably a
# smoke test; pass the full `doc` list to index the whole corpus.
nodes = node_parser.get_nodes_from_documents(documents=[doc[0]])
base_nodes = text_splitter.get_nodes_from_documents(documents=[doc[0]])
sentence_index = VectorStoreIndex(nodes)
base_index = VectorStoreIndex(base_nodes)
# Query engine over the sentence-window index: retrieved nodes have their
# text replaced by the stored window before being sent to the LLM.
query_engine = sentence_index.as_query_engine(
    similarity_top_k=2,
    # the target key defaults to `window` to match the node_parser's default
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)
# Sample query: compare the retrieved window against the single matched
# sentence for the top source node.
window_response = query_engine.query("Who is Alice?")
window = window_response.source_nodes[0].node.metadata["window"]
sentence = window_response.source_nodes[0].node.metadata["original_sentence"]
print(f"Window: {window}")
print("------------------")
print(f"Original Sentence: {sentence}")