# SexBot / ingestion.py
# Pew404's picture
# Upload folder using huggingface_hub
# 13fbd2e verified
from llama_index.core import SimpleDirectoryReader, get_response_synthesizer
from llama_index.core import DocumentSummaryIndex
from llama_index.llms.ollama import Ollama
from llama_index.core.node_parser import SentenceSplitter
from langchain_community.embeddings import OllamaEmbeddings
from llama_index.core import Settings
import nest_asyncio
nest_asyncio.apply()
# Load data
def _pdf_file_metadata(file_path):
    """Return the metadata dict attached to each document read from ``file_path``.

    SimpleDirectoryReader invokes ``file_metadata`` with the path of every file
    it loads and uses the returned dict as that document's metadata.  Passing a
    bare dict (as this script previously did) fails inside ``load_data()``
    because a dict is not callable, so the constant category is wrapped here.
    The path is accepted only to satisfy that calling convention.
    """
    # NOTE(review): a custom file_metadata presumably replaces the reader's
    # default per-file metadata (file name/path etc.) -- confirm that is wanted.
    return {"category": "AI applications"}


# Read every supported file under ../data/pdf (relative to the working
# directory) and tag each resulting document with the fixed category.
react_doc = SimpleDirectoryReader(
    input_dir="../data/pdf",
    file_metadata=_pdf_file_metadata,
).load_data()

# LLM served by the Ollama instance at localhost:11434; request_timeout is
# 120 so long generations are not cut off early.
llm = Ollama(model="llama3", request_timeout=120, base_url="http://localhost:11434")
# Embeddings come from the same "llama3" model via the LangChain Ollama wrapper.
embed_model = OllamaEmbeddings(model="llama3")

# Register both models as the global LlamaIndex defaults.
Settings.llm = llm
Settings.embed_model = embed_model
# Text splitter: 500-unit chunks with no overlap between neighbouring chunks.
sentence_splitter = SentenceSplitter(
    chunk_overlap=0,
    chunk_size=500,
)

# Response synthesizer: "tree_summarize" response mode with async calls
# enabled (nest_asyncio is applied at the top of the file so nested event
# loops are allowed).
response_synthesizer = get_response_synthesizer(
    use_async=True,
    response_mode="tree_summarize",
)
# Build the index. `transformations` lists the preprocessing applied to the
# documents (here: the sentence splitter); `response_synthesizer` is the
# synthesizer configured earlier in this script.
index = DocumentSummaryIndex.from_documents(
    documents=react_doc,
    transformations=[sentence_splitter],
    response_synthesizer=response_synthesizer,
    llm=llm,
    show_progress=True,
)

# Persist the index to disk so it can be reloaded without re-ingesting.
index.storage_context.persist(persist_dir="../kb/index")