"""Build and persist a DocumentSummaryIndex over local PDFs using Ollama.

Pipeline: load PDFs -> split into sentence chunks -> summarize each document
with a local llama3 model -> persist the resulting index to ../kb/index.
Requires an Ollama server listening on http://localhost:11434.
"""
from llama_index.core import SimpleDirectoryReader, get_response_synthesizer
from llama_index.core import DocumentSummaryIndex
from llama_index.llms.ollama import Ollama
from llama_index.core.node_parser import SentenceSplitter
from langchain_community.embeddings import OllamaEmbeddings
from llama_index.core import Settings
import nest_asyncio

# Allow re-entrant event loops: use_async=True below drives asyncio, which
# fails inside environments (e.g. Jupyter) that already run a loop.
nest_asyncio.apply()

# Load data.
# NOTE: file_metadata must be a callable (file path -> metadata dict) — the
# reader invokes it once per file. Passing a bare dict raises TypeError.
react_doc = SimpleDirectoryReader(
    input_dir="../data/pdf",
    file_metadata=lambda _path: {"category": "AI applications"},
).load_data()

llm = Ollama(model="llama3", request_timeout=120, base_url="http://localhost:11434")
# NOTE(review): this is a LangChain embedding; llama_index wraps it via its
# LangchainEmbedding adapter when assigned to Settings — confirm the installed
# llama_index version supports that resolution.
embed_model = OllamaEmbeddings(model="llama3")
Settings.llm = llm
Settings.embed_model = embed_model

# Text splitter: ~500-token chunks, no overlap between consecutive chunks.
sentence_splitter = SentenceSplitter(chunk_size=500, chunk_overlap=0)

# response_mode controls how retrieved nodes are combined into an answer;
# tree_summarize builds a summary tree bottom-up. use_async parallelizes
# the per-chunk LLM calls.
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize",
    use_async=True,
)

# Build the index: `transformations` preprocess the documents (chunking here);
# `response_synthesizer` generates each document's summary.
index = DocumentSummaryIndex.from_documents(
    react_doc,
    llm=llm,
    transformations=[sentence_splitter],
    response_synthesizer=response_synthesizer,
    show_progress=True,
)

# Persist the index (docstore, vector store, summaries) to disk.
index.storage_context.persist("../kb/index")