# Gather every file path of a given type under the data directory
import os
data_path = '/data1/home/purui/projects/chatbot/data'

def get_all_files(path, file_type):
    file_paths = []
    sub_path = os.path.join(path, file_type)
    for root, dirs, files in os.walk(sub_path):
        for file in files:
            file_paths.append(os.path.join(root, file))
    return file_paths

from llama_index.core import SimpleDirectoryReader
def load(path, file_type):
    docs = []
    files = get_all_files(path, file_type)
    for file in files:
        file_name = os.path.splitext(os.path.basename(file))[0]
        # load_data() can return several Documents per file (e.g. one per
        # PDF page); only the first is kept here.
        doc = SimpleDirectoryReader(input_files=[file]).load_data()[0]
        doc.metadata.update({"file_name": file_name})
        docs.append(doc)
    return docs
        
docs = load(data_path, 'pdf')
print(f"Total files: {len(docs)}")

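# Sentence-window retrieval: embed and retrieve single sentences, but hand the
# LLM a wider window of surrounding sentences at answer time.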
from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter
from llama_index.llms.ollama import Ollama
from langchain_community.embeddings.ollama import OllamaEmbeddings
from llama_index.core import VectorStoreIndex
from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core import Settings

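# Split documents into single-sentence nodes; each node also stores the
# 3 sentences on either side under the "window" metadata key, and the
# sentence itself under "original_sentence".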
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_sentence",
)

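# Plain sentence-based chunking, used to build a baseline index for comparison.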
text_splitter = SentenceSplitter()

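# Local Ollama models for generation and embeddings; LlamaIndex can wrap the
# LangChain embeddings object when it is assigned to Settings.embed_model.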
llm = Ollama(model="llama3")
embed_model = OllamaEmbeddings(model="llama3")

Settings.llm = llm
Settings.embed_model = embed_model
Settings.text_splitter = text_splitter

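# Parse the first document into sentence-window nodes and into plain chunks,
# then build a separate index over each.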
nodes = node_parser.get_nodes_from_documents(documents=[docs[0]])
base_nodes = text_splitter.get_nodes_from_documents(documents=[docs[0]])

sentence_index = VectorStoreIndex(nodes)
base_index = VectorStoreIndex(base_nodes)

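# At query time, MetadataReplacementPostProcessor swaps each retrieved
# sentence for the full window stored in its metadata, so the LLM sees more
# surrounding context than was embedded.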
query_engine = sentence_index.as_query_engine(
    similarity_top_k=2,
    # the target key defaults to `window` to match the node_parser's default
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)
window_response = query_engine.query("Who is Alice?")
window = window_response.source_nodes[0].node.metadata["window"]
sentence = window_response.source_nodes[0].node.metadata["original_sentence"]

print(f"Window: {window}")
print("------------------")
print(f"Original Sentence: {sentence}")
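
# Optional comparison (a minimal sketch; `base_query_engine` and
# `base_response` are illustrative names): run the same question against the
# baseline index, which retrieves plain chunks without window replacement.
base_query_engine = base_index.as_query_engine(similarity_top_k=2)
base_response = base_query_engine.query("Who is Alice?")

print(f"Base response: {base_response}")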