Spaces:
Runtime error
Runtime error
File size: 1,914 Bytes
5f3b20a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 |
import os
from langchain.schema.document import Document
from e5_embeddings import E5Embeddings
from langchain_community.vectorstores import FAISS
from document_processor_image import load_documents, split_documents # ๋ฐ๋์ ์ด ํจ์๊ฐ ํ์
# Path settings
NEW_FOLDER = "25.05.28 RAG์ฉ 2์ฐจ ์
๋ฌดํธ๋ ์ทจํฉ๋ณธ"
#NEW_FOLDER = "์์"
VECTOR_STORE_PATH = "vector_db"
# 1. Load the embedding model
def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
    """Build the E5 embedding wrapper used by this script.

    Args:
        model_name: Model identifier forwarded to ``E5Embeddings``.
        device: Device string placed under the ``'device'`` key of
            ``model_kwargs`` (defaults to ``"cuda"``).

    Returns:
        An ``E5Embeddings`` instance created with
        ``encode_kwargs={'normalize_embeddings': True}``.
    """
    model_options = {'device': device}
    encode_options = {'normalize_embeddings': True}
    return E5Embeddings(
        model_name=model_name,
        model_kwargs=model_options,
        encode_kwargs=encode_options,
    )
# 2. Load the existing vector store
def load_vector_store(embeddings, load_path="vector_db"):
    """Load a previously saved FAISS vector store from ``load_path``.

    Args:
        embeddings: Embedding object handed to ``FAISS.load_local``.
        load_path: Filesystem path of the saved store.

    Returns:
        The object returned by ``FAISS.load_local`` for that path.

    Raises:
        FileNotFoundError: If ``load_path`` does not exist.
    """
    if os.path.exists(load_path):
        # NOTE(security): allow_dangerous_deserialization=True opts in to the
        # loader's "dangerous" deserialization path (pickle-based per the
        # LangChain docs — confirm); only load stores from a trusted source.
        return FAISS.load_local(
            load_path,
            embeddings,
            allow_dangerous_deserialization=True,
        )
    raise FileNotFoundError(f"๋ฒกํฐ ์คํ ์ด๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค: {load_path}")
# 3. Embed and add documents
def add_new_documents_to_vector_store(new_folder, vectorstore, embeddings):
    """Load, chunk, and append the documents in ``new_folder`` to ``vectorstore``.

    Progress (chunk count and the index size before/after) is printed to
    stdout. The store is modified in place; saving it is left to the caller.

    Args:
        new_folder: Folder passed to ``load_documents``.
        vectorstore: Store exposing ``index.ntotal`` and ``add_documents``.
        embeddings: Not used by this function (``add_documents`` is called
            without it); kept so existing callers keep working.

    Returns:
        None.
    """
    print(f"๐ ์๋ก์ด ๋ฌธ์ ๋ก๋ ์ค: {new_folder}")
    new_docs = load_documents(new_folder)
    new_chunks = split_documents(new_docs, chunk_size=800, chunk_overlap=100)
    print(f"๐ ์๋ก์ด ์ฒญํฌ ์: {len(new_chunks)}")

    if len(new_chunks) == 0:
        # Nothing to embed: skip the add call instead of handing the store an
        # empty batch, and leave the index untouched.
        print("No new chunks to add; vector store left unchanged.")
        return

    print(f"์ถ๊ฐ ์ ๋ฒกํฐ ์: {vectorstore.index.ntotal}")
    vectorstore.add_documents(new_chunks)
    print(f"์ถ๊ฐ ํ ๋ฒกํฐ ์: {vectorstore.index.ntotal}")
    # NOTE(review): this literal was split across two lines (an unterminated
    # string) and is rejoined here with its visible text; the text itself looks
    # mis-decoded — TODO restore the original UTF-8 message.
    print("โ ์๋ก์ด ๋ฌธ์๊ฐ ๋ฒกํฐ ์คํ ์ด์ ์ถ๊ฐ๋์์ต๋๋ค.")
# 4. Run the full pipeline
def _run_update():
    """Load the saved store, add the documents from NEW_FOLDER, and save it back."""
    embedder = get_embeddings()
    store = load_vector_store(embedder, VECTOR_STORE_PATH)
    add_new_documents_to_vector_store(NEW_FOLDER, store, embedder)
    # Write the updated store back to the same path it was loaded from.
    store.save_local(VECTOR_STORE_PATH)
    print(f"๐พ ๋ฒกํฐ ์คํ ์ด ์ ์ฅ ์๋ฃ: {VECTOR_STORE_PATH}")


if __name__ == "__main__":
    _run_update()
|