open-webui-rag-system / concat_vector_store.py
hugging2021's picture
Upload folder using huggingface_hub
5f3b20a verified
import os
from langchain.schema.document import Document
from e5_embeddings import E5Embeddings
from langchain_community.vectorstores import FAISS
from document_processor_image import load_documents, split_documents  # these functions are required
# Path settings
# Folder whose documents are loaded, chunked, and added to the vector store.
NEW_FOLDER = "25.05.28 RAG์šฉ 2์ฐจ ์—…๋ฌดํŽธ๋žŒ ์ทจํ•ฉ๋ณธ"
# Alternative source folder, kept commented out for reference:
#NEW_FOLDER = "์ž„์‹œ"
# Directory the vector store is loaded from and saved back to.
VECTOR_STORE_PATH = "vector_db"
# 1. Load the embedding model
def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
    """Construct the E5 embedding wrapper used by the vector store.

    Args:
        model_name: Model identifier forwarded to ``E5Embeddings``.
        device: Device string placed in ``model_kwargs`` (defaults to
            ``"cuda"``; pass ``"cpu"`` explicitly on machines without a GPU).

    Returns:
        An ``E5Embeddings`` instance configured with
        ``normalize_embeddings=True`` in its encode options.
    """
    model_options = {'device': device}
    encode_options = {'normalize_embeddings': True}
    return E5Embeddings(
        model_name=model_name,
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )
# 2. Load the existing vector store
def load_vector_store(embeddings, load_path="vector_db"):
    """Open a previously saved FAISS vector store.

    Args:
        embeddings: Embedding object handed to ``FAISS.load_local``.
        load_path: Filesystem path of the saved store.

    Returns:
        The object returned by ``FAISS.load_local``.

    Raises:
        FileNotFoundError: If ``load_path`` does not exist.
    """
    if os.path.exists(load_path):
        # NOTE(review): allow_dangerous_deserialization=True opts in to
        # loading pickled data; only point this at stores you trust.
        return FAISS.load_local(
            load_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )
    raise FileNotFoundError(f"๋ฒกํ„ฐ ์Šคํ† ์–ด๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค: {load_path}")
# 3. Embed the documents and add them to the store
def add_new_documents_to_vector_store(new_folder, vectorstore, embeddings):
    """Load, chunk, and append the documents in ``new_folder`` to ``vectorstore``.

    Progress is reported on stdout. The store is modified in place; saving it
    is left to the caller.

    Args:
        new_folder: Path passed to ``load_documents`` to read the new documents.
        vectorstore: Store exposing ``add_documents`` and ``index.ntotal``.
        embeddings: Not used by this function; kept so existing call sites
            remain valid. Presumably the store embeds new chunks with the
            model it was loaded with -- verify against the store in use.

    Returns:
        None.
    """
    print(f"๐Ÿ“‚ ์ƒˆ๋กœ์šด ๋ฌธ์„œ ๋กœ๋“œ ์ค‘: {new_folder}")
    new_docs = load_documents(new_folder)
    # Chunks of 800 units with a 100-unit overlap between neighbours.
    new_chunks = split_documents(new_docs, chunk_size=800, chunk_overlap=100)
    chunk_count = len(new_chunks)
    print(f"๐Ÿ“„ ์ƒˆ๋กœ์šด ์ฒญํฌ ์ˆ˜: {chunk_count}")

    if chunk_count == 0:
        # An empty batch gives the index nothing to add (and can make the
        # underlying index call fail), so skip it instead of reporting a
        # successful addition that never happened.
        print("No new chunks to add; vector store left unchanged.")
        return

    print(f"์ถ”๊ฐ€ ์ „ ๋ฒกํ„ฐ ์ˆ˜: {vectorstore.index.ntotal}")
    vectorstore.add_documents(new_chunks)
    print(f"์ถ”๊ฐ€ ํ›„ ๋ฒกํ„ฐ ์ˆ˜: {vectorstore.index.ntotal}")
    print("โœ… ์ƒˆ๋กœ์šด ๋ฌธ์„œ๊ฐ€ ๋ฒกํ„ฐ ์Šคํ† ์–ด์— ์ถ”๊ฐ€๋˜์—ˆ์Šต๋‹ˆ๋‹ค.")
# 4. Run the full pipeline
if __name__ == "__main__":
    # Build the embedding model, then open the persisted store with it.
    embedding_model = get_embeddings()
    store = load_vector_store(embedding_model, VECTOR_STORE_PATH)

    # Append the new folder's documents to the in-memory store.
    add_new_documents_to_vector_store(NEW_FOLDER, store, embedding_model)

    # Write the updated store back to the same location it was loaded from.
    store.save_local(VECTOR_STORE_PATH)
    print(f"๐Ÿ’พ ๋ฒกํ„ฐ ์Šคํ† ์–ด ์ €์žฅ ์™„๋ฃŒ: {VECTOR_STORE_PATH}")