|
|
|
import os |
|
# Root directory holding the source documents, organised in one
# sub-directory per file type (e.g. ``<data_path>/pdf``).
# The default is the original hard-coded location; set the
# CHATBOT_DATA_PATH environment variable to run against another directory
# without editing this file.
data_path = os.environ.get(
    'CHATBOT_DATA_PATH',
    '/data1/home/purui/projects/chatbot/data',
)
|
|
|
def get_all_files(path, file_type):
    """Recursively list every file under ``<path>/<file_type>``.

    Args:
        path: Base data directory (a string or any ``os.PathLike``).
        file_type: Name of the sub-directory of ``path`` to scan. It is
            used only to pick the sub-directory; files are NOT filtered by
            extension.

    Returns:
        A list of file paths (strings), in the order ``os.walk`` yields
        them. The list is empty when the sub-directory does not exist,
        because ``os.walk`` ignores listing errors by default.
    """
    # os.path.join avoids doubled separators when ``path`` already ends
    # with one, and accepts path-like objects as well as plain strings.
    sub_path = os.path.join(path, file_type)
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(sub_path)
        for name in names
    ]
|
|
|
from llama_index.core import SimpleDirectoryReader |
|
def load(path, file_type):
    """Load one document per file found under ``<path>/<file_type>``.

    Each file is read on its own with ``SimpleDirectoryReader`` and the
    first document it returns is kept, with its ``file_name`` metadata set
    to the file's base name without the final extension.

    Args:
        path: Base data directory.
        file_type: Sub-directory of ``path`` to load from (e.g. ``'pdf'``).

    Returns:
        A list with at most one document per file. Files for which the
        reader returns no documents are skipped.
    """
    docs = []
    for file in get_all_files(path, file_type):
        loaded = SimpleDirectoryReader(input_files=[file]).load_data()
        if not loaded:
            # Nothing could be extracted from this file; skip it instead
            # of failing the whole run with an IndexError.
            continue

        # NOTE(review): only the first returned document is kept. If the
        # reader yields several documents per file (presumably one per
        # page for PDFs -- TODO confirm), the remaining ones are dropped.
        doc = loaded[0]

        # Strip the directory and only the last extension, so a name like
        # "report.v2.pdf" becomes "report.v2" rather than "report".
        file_name = os.path.splitext(os.path.basename(file))[0]
        doc.metadata["file_name"] = file_name
        docs.append(doc)
    return docs
|
|
|
# Load the documents stored under ``<data_path>/pdf``. Despite the singular
# name, ``doc`` is the list returned by ``load``.
doc = load(data_path, file_type='pdf')

# Report how many documents were loaded.
print(f"Total file: {len(doc)}")
|
|
|
from llama_index.core.node_parser import SentenceWindowNodeParser, SentenceSplitter |
|
from llama_index.llms.ollama import Ollama |
|
from langchain_community.embeddings.ollama import OllamaEmbeddings |
|
from llama_index.core import VectorStoreIndex |
|
from llama_index.core.postprocessor import MetadataReplacementPostProcessor |
|
from llama_index.core import Settings |
|
|
|
# --- Node parsers ---------------------------------------------------------
# Sentence-window parser: configured with window_size=3, storing the window
# under the "window" metadata key and the original sentence under
# "original_sentence".
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_sentence",
)

# Plain sentence splitter with the library defaults, used for the baseline.
text_splitter = SentenceSplitter()

# --- Models ---------------------------------------------------------------
# The same local Ollama model name is used for generation and embeddings.
llm = Ollama(model="llama3")
embed_model = OllamaEmbeddings(model="llama3")

# Register the models and splitter on the global LlamaIndex settings before
# any index is built.
Settings.llm = llm
Settings.embed_model = embed_model
Settings.text_splitter = text_splitter

# --- Nodes and indexes ----------------------------------------------------
# Only the first loaded document is parsed and indexed.
_first_document = doc[0]
nodes = node_parser.get_nodes_from_documents(documents=[_first_document])
base_nodes = text_splitter.get_nodes_from_documents(documents=[_first_document])

sentence_index = VectorStoreIndex(nodes)
# Baseline index over the plain sentence-split nodes; it is built here but
# not queried in this part of the script.
base_index = VectorStoreIndex(base_nodes)

# --- Query ----------------------------------------------------------------
# Retrieve the top 2 matches and run the post-processor that targets the
# "window" metadata key.
query_engine = sentence_index.as_query_engine(
    similarity_top_k=2,
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window"),
    ],
)
window_response = query_engine.query("Who is Alice?")

# Pull the window and the original sentence from the first source node.
_top_metadata = window_response.source_nodes[0].node.metadata
window = _top_metadata["window"]
sentence = _top_metadata["original_sentence"]

print(f"Window: {window}")
print("------------------")
print(f"Original Sentence: {sentence}")