# Collect the paths of all files of a given type under a directory
import os

data_path = '/data1/home/purui/projects/chatbot/data'

def get_all_files(path, file_type):
    """Recursively list every file under `path/<file_type>`."""
    file_paths = []
    sub_path = os.path.join(path, file_type)
    for root, dirs, files in os.walk(sub_path):
        for file in files:
            file_paths.append(os.path.join(root, file))
    return file_paths

from llama_index.core import SimpleDirectoryReader

def load(path, file_type):
    """Load each file into a Document tagged with its base file name."""
    docs = []
    files = get_all_files(path, file_type)
    for file in files:
        file_name = os.path.splitext(os.path.basename(file))[0]
        doc = SimpleDirectoryReader(input_files=[file]).load_data()[0]
        doc.metadata.update({"file_name": file_name})
        docs.append(doc)
    return docs

docs = load(data_path, 'pdf')
print(f"Total files: {len(docs)}")

from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core import Settings
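
# SentenceWindowNodeParser embeds one sentence per node but keeps the
# surrounding sentences in node metadata, so retrieval matches on a precise
# sentence while the LLM still sees that sentence's local context.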
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_sentence",
)
text_splitter = SentenceSplitter()
llm = Ollama(model="llama3")
embed_model = OllamaEmbedding(model_name="llama3")
Settings.llm = llm
Settings.embed_model = embed_model
Settings.text_splitter = text_splitter

nodes = node_parser.get_nodes_from_documents(documents=[docs[0]])
base_nodes = text_splitter.get_nodes_from_documents(documents=[docs[0]])
sentence_index = VectorStoreIndex(nodes)
base_index = VectorStoreIndex(base_nodes)
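# Two indexes over the first document: `sentence_index` holds the
# single-sentence window nodes, `base_index` holds plain chunks as a
# retrieval baseline.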

query_engine = sentence_index.as_query_engine(
    similarity_top_k=2,
    # the target key defaults to `window` to match the node_parser's default
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)
window_response = query_engine.query("Who is Alice?")
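# The query also synthesizes an answer; printing the response object
# (its str() is the answer text) shows what the LLM produced from the
# window-expanded context.
print(f"Response: {window_response}")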
window = window_response.source_nodes[0].node.metadata["window"]
sentence = window_response.source_nodes[0].node.metadata["original_sentence"]
print(f"Window: {window}")
print("------------------")
print(f"Original Sentence: {sentence}") |