#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Vector store module: generates document embeddings and builds a FAISS vector store.
Uses batch processing to keep memory usage low and to avoid errors on long chunks.
"""
import os
import argparse
import logging
from tqdm import tqdm
from langchain_community.vectorstores import FAISS
from langchain.schema.document import Document
from langchain_huggingface import HuggingFaceEmbeddings
# Logging setup - suppress unneeded warning messages
logging.getLogger().setLevel(logging.ERROR)
def get_embeddings(model_name="intfloat/multilingual-e5-large-instruct", device="cuda"):
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True}
    )
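
# Quick sanity-check sketch (not part of the original module; the device and the
# placeholder text are assumptions): embed a single string and inspect the vector.
# e5-large models are expected to produce 1024-dimensional, unit-normalized vectors.
#
#   emb = get_embeddings(device="cpu")
#   vec = emb.embed_query("hello world")
#   print(len(vec))  # expected 1024; unit-length since normalize_embeddings=True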
def build_vector_store_batch(documents, embeddings, save_path="vector_db", batch_size=16):
    if not documents:
        raise ValueError("No documents to index. Check that the documents were loaded correctly.")
    texts = [doc.page_content for doc in documents]
    metadatas = [doc.metadata for doc in documents]
    # Split texts and metadata into batches
    batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    metadata_batches = [metadatas[i:i + batch_size] for i in range(0, len(metadatas), batch_size)]
    print(f"Processing {len(batches)} batches with size {batch_size}")
    print(f"Initializing vector store with batch 1/{len(batches)}")
    # Use from_documents instead of from_texts to avoid length/mismatch issues
    first_docs = [
        Document(page_content=text, metadata=meta)
        for text, meta in zip(batches[0], metadata_batches[0])
    ]
    vectorstore = FAISS.from_documents(first_docs, embeddings)
    # Add the remaining batches
    for i in tqdm(range(1, len(batches)), desc="Processing batches"):
        try:
            docs_batch = [
                Document(page_content=text, metadata=meta)
                for text, meta in zip(batches[i], metadata_batches[i])
            ]
            vectorstore.add_documents(docs_batch)
            # Checkpoint every 10 batches so progress survives a crash
            if i % 10 == 0:
                temp_save_path = f"{save_path}_temp"
                os.makedirs(os.path.dirname(temp_save_path) or '.', exist_ok=True)
                vectorstore.save_local(temp_save_path)
                print(f"Temporary vector store saved to {temp_save_path} after batch {i}")
        except Exception as e:
            print(f"Error processing batch {i}: {e}")
            # Persist whatever has been indexed so far, then re-raise
            error_save_path = f"{save_path}_error_at_batch_{i}"
            os.makedirs(os.path.dirname(error_save_path) or '.', exist_ok=True)
            vectorstore.save_local(error_save_path)
            print(f"Partial vector store saved to {error_save_path}")
            raise
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
    vectorstore.save_local(save_path)
    print(f"Vector store saved to {save_path}")
    return vectorstore
def load_vector_store(embeddings, load_path="vector_db"):
    if not os.path.exists(load_path):
        raise FileNotFoundError(f"Vector store not found: {load_path}")
    return FAISS.load_local(load_path, embeddings, allow_dangerous_deserialization=True)
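
# Query-time sketch (assumed usage, not part of the original file): load a
# previously saved index and run a plain similarity search; "vector_db" and
# the query text are placeholders.
#
#   embeddings = get_embeddings(device="cpu")
#   store = load_vector_store(embeddings, "vector_db")
#   for doc in store.similarity_search("sample query", k=3):
#       print(doc.metadata.get("source"), doc.page_content[:100])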
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Build a vector store")
    parser.add_argument("--folder", type=str, default="dataset", help="Folder path containing the documents")
    parser.add_argument("--save_path", type=str, default="vector_db", help="Path to save the vector store")
    parser.add_argument("--batch_size", type=int, default=16, help="Batch size")
    parser.add_argument("--model_name", type=str, default="intfloat/multilingual-e5-large-instruct", help="Embedding model name")
    parser.add_argument("--device", type=str, default="cuda", help="Device to use ('cuda' or 'cpu')")
    args = parser.parse_args()
    # Import the document processing module
    from document_processor import load_documents, split_documents
    # Load and split the documents
    documents = load_documents(args.folder)
    chunks = split_documents(documents, chunk_size=800, chunk_overlap=100)
    # Load the embedding model
    embeddings = get_embeddings(model_name=args.model_name, device=args.device)
    # Build the vector store
    build_vector_store_batch(chunks, embeddings, args.save_path, args.batch_size)
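    # Example invocation (sketch; the filename "vector_store.py" is assumed):
    #   python vector_store.py --folder dataset --save_path vector_db --batch_size 16 --device cuda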