File size: 1,382 Bytes
60e4d0e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
import json
import os
import shutil

from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
# A stronger embedding model is used here to improve match quality.
# The encoder is asked to normalize the embeddings it produces.
embedding_model = HuggingFaceEmbeddings(
    encode_kwargs=dict(normalize_embeddings=True),
    model_name="sentence-transformers/all-mpnet-base-v2",
)
# Load the Q&A data and build the vector database.
def load_qa_and_create_vectorstore(
    qa_path="Q&A_cleaned.json",
    persist_directory="./vistula_chroma",
):
    """Build a fresh Chroma vector store from a Q&A JSON file.

    The JSON file is read as a list of items, each providing ``QUESTION``
    and ``ANSWER`` keys. Every item becomes one document of the form
    ``"Question: ...\\nAnswer: ..."``, which is then split into chunks
    (chunk_size=800, chunk_overlap=200) and embedded with the module-level
    ``embedding_model``.

    Any existing directory at ``persist_directory`` is removed first, so the
    store is always rebuilt from the current data.

    Args:
        qa_path: Path of the Q&A JSON file to load.
        persist_directory: Directory in which Chroma persists the store.

    Returns:
        The retriever obtained from ``as_retriever()`` on the new store.

    Raises:
        OSError: If ``qa_path`` cannot be opened or the old store directory
            cannot be removed.
        json.JSONDecodeError: If ``qa_path`` does not contain valid JSON.
        KeyError: If an item lacks the ``QUESTION`` or ``ANSWER`` key.
    """
    with open(qa_path, "r", encoding="utf-8") as f:
        qa_data = json.load(f)

    documents = [
        Document(
            page_content=f"Question: {item['QUESTION']}\nAnswer: {item['ANSWER']}",
            metadata={},
        )
        for item in qa_data
    ]

    text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=200)
    split_docs = text_splitter.split_documents(documents)

    # If an old database exists, delete it so it is rebuilt from the new data.
    # shutil.rmtree is used instead of shelling out to `rm -rf`: it works on
    # every platform, targets the same path that is handed to Chroma below,
    # and raises on failure instead of silently ignoring it.
    if os.path.isdir(persist_directory):
        shutil.rmtree(persist_directory)

    vectordb = Chroma.from_documents(
        split_docs,
        embedding_model,
        persist_directory=persist_directory,
    )
    return vectordb.as_retriever()
|