from llama_index.core import SimpleDirectoryReader, get_response_synthesizer
from llama_index.core import DocumentSummaryIndex
from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding
import nest_asyncio

nest_asyncio.apply()

# Load data; file_metadata must be a callable mapping a file path to a metadata dict
react_doc = SimpleDirectoryReader(
    input_dir="../data/pdf",
    file_metadata=lambda file_path: {"category": "AI applications"},
).load_data()

llm = Ollama(model="llama3", request_timeout=120.0, base_url="http://localhost:11434")
# Use LlamaIndex's native Ollama embedding (instead of LangChain's OllamaEmbeddings)
# so it plugs into Settings directly
embed_model = OllamaEmbedding(model_name="llama3", base_url="http://localhost:11434")
Settings.llm = llm
Settings.embed_model = embed_model

# Text splitter
sentence_splitter = SentenceSplitter(chunk_size=500, chunk_overlap=0)

# response_mode: how responses are synthesized from the retrieved nodes
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize",
    use_async=True,
)

# Build index: transformations specifies document preprocessing,
# response_synthesizer controls how the per-document summaries are generated
index = DocumentSummaryIndex.from_documents(
    react_doc,
    llm=llm,
    transformations=[sentence_splitter],
    response_synthesizer=response_synthesizer,
    show_progress=True,
)

# Persist the index to disk
index.storage_context.persist("../kb/index")
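
# --- Usage sketch (an assumption, not from the original): reload the persisted
# --- index in a fresh session and query it. Settings.llm / Settings.embed_model
# --- must be configured as above before loading, or LlamaIndex falls back to
# --- its defaults. The query string below is a hypothetical example.
from llama_index.core import StorageContext, load_index_from_storage

storage_context = StorageContext.from_defaults(persist_dir="../kb/index")
loaded_index = load_index_from_storage(storage_context)

# Query the document summaries with the same tree_summarize response mode
query_engine = loaded_index.as_query_engine(response_mode="tree_summarize")
response = query_engine.query("Summarize the main ideas of the ReAct paper.")
print(response)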