# Spaces: Sleeping, Sleeping
# (Presumably page-status text captured along with this file; it is not part of the program.)
import os

from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_mongodb.retrievers.hybrid_search import MongoDBAtlasHybridSearchRetriever
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
# Load variables from a .env file before any os.getenv() lookup below runs.
load_dotenv()
# ---- MongoDB credentials ----
# NOTE(review): separate MONGO_USERNAME / MONGO_PASSWORD lookups are disabled;
# presumably the credentials are carried inside MONGO_CONNECTION_STRING — confirm.
# os.getenv() returns None for any variable that is not set.
mongo_database = os.getenv("MONGO_DATABASE")
mongo_connection_str = os.getenv("MONGO_CONNECTION_STRING")
mongo_collection_name = os.getenv("MONGO_COLLECTION")

# ---- Common Configurations & Hybrid Retrieval Configuration ----
# Keyword arguments handed to the embedding model constructor.
MODEL_KWARGS = {"device": "cpu"}
ENCODE_KWARGS = {
    "normalize_embeddings": True,
    "batch_size": 32,
}
# Not referenced elsewhere in the visible part of this file; presumably the
# output size of MODEL_NAME and expected to match the index definition — confirm.
EMBEDDING_DIMENSIONS = 1024
MODEL_NAME = "BAAI/bge-m3"
# Passed to the hybrid retriever as top_k (the original trailing note records 30).
FINAL_TOP_K = 50
# Passed to the hybrid retriever as fulltext_penalty / vector_penalty
# (the original trailing notes record 60 for both).
HYBRID_FULLTEXT_PENALTY = 0
HYBRID_VECTOR_PENALTY = 0.8
# ---- Embedding model ----
# Single shared embedding model instance, built once at import time from the
# MODEL_NAME / MODEL_KWARGS / ENCODE_KWARGS settings.
embed_model = HuggingFaceEmbeddings(
    model_name=MODEL_NAME,
    model_kwargs=MODEL_KWARGS,
    encode_kwargs=ENCODE_KWARGS,
)
# ---- Vector Search ----
# Candidate pool sizes: twice the final result count, but never fewer than 20.
num_vector_candidates = max(20, 2 * FINAL_TOP_K)
num_text_candidates = max(20, 2 * FINAL_TOP_K)
# "k" value for the vector search parameters.
vector_k = num_vector_candidates
# "numCandidates" value for the vector search parameters: ten times vector_k.
vector_num_candidates_for_operator = vector_k * 10
# ---- Vector Store ----
# The namespace is "<database>.<collection>". Because os.getenv() yields None for
# an unset variable, a missing MONGO_DATABASE or MONGO_COLLECTION shows up here
# as the literal text "None" rather than raising an error.
vector_store = MongoDBAtlasVectorSearch.from_connection_string(
    connection_string=mongo_connection_str,
    namespace=f"{mongo_database}.{mongo_collection_name}",
    embedding=embed_model,
    index_name="search_index_v1",
)
# ---- Retriever (Hybrid) ----
def get_retriever(**kwargs):
    """Build a hybrid-search retriever, optionally with a vector-search filter.

    Args:
        **kwargs: Optional filter settings.
            ``vector_search_filter``, when present, is removed from ``kwargs``
            and forwarded as the ``"filter"`` entry of ``vector_search_params``
            (``None`` when it was not supplied).
            Every remaining keyword argument is passed on, as one dict, as
            ``pre_filter``; when nothing remains, ``pre_filter`` is ``None``.

    Returns:
        A newly constructed ``MongoDBAtlasHybridSearchRetriever`` bound to the
        module-level ``vector_store`` and ``embed_model``.
    """
    # Pop the vector-search filter first so it does not end up in pre_filter.
    vector_search_filter = kwargs.pop("vector_search_filter", None)
    # Whatever is left (if anything) is used as the pre-filter.
    pre_filter = kwargs or None

    # NOTE(review): confirm against the installed langchain-mongodb version that
    # the retriever actually consumes `embedding`, `text_key`, `embedding_key`,
    # `vector_search_params` and `text_search_params` (and accepts a None
    # "filter") rather than silently ignoring unknown keyword arguments.
    # NOTE(review): "search_index_v1" is used both here as search_index_name and
    # as the vector store's index_name — confirm one index name is intended for
    # both roles.
    return MongoDBAtlasHybridSearchRetriever(
        vectorstore=vector_store,
        search_index_name="search_index_v1",
        embedding=embed_model,
        text_key="text",
        embedding_key="embedding",
        top_k=FINAL_TOP_K,
        vector_penalty=HYBRID_VECTOR_PENALTY,
        fulltext_penalty=HYBRID_FULLTEXT_PENALTY,
        vector_search_params={
            "k": vector_k,
            "numCandidates": vector_num_candidates_for_operator,
            "filter": vector_search_filter,
        },
        # Reuse the module-level candidate count instead of recomputing
        # max(20, 2 * FINAL_TOP_K) inline; both evaluate to the same value.
        text_search_params={"limit": num_text_candidates},
        pre_filter=pre_filter,
    )
# def get_retriever(**kwargs): | |
# retriever = MongoDBAtlasHybridSearchRetriever( | |
# vectorstore=vector_store, | |
# search_index_name='search_index_v1', | |
# embedding=embed_model, | |
# text_key= 'text', #'token', | |
# embedding_key='embedding', | |
# top_k=FINAL_TOP_K, | |
# vector_penalty=HYBRID_VECTOR_PENALTY, | |
# fulltext_penalty=HYBRID_FULLTEXT_PENALTY, | |
# vector_search_params={ | |
# "k": vector_k, | |
# "numCandidates": vector_num_candidates_for_operator | |
# }, | |
# text_search_params={ | |
# "limit": num_text_candidates | |
# }, | |
# pre_filter=kwargs | |
# ) | |
# return retriever | |
# ---------- FILTER METAAAAA ---------- | |
# ---------- FILTER METAAAAA ---------- | |
# ---------- FILTER METAAAAA ---------- | |
# ---------- FILTER METAAAAA ---------- | |
# def get_retriever(**kwargs): | |
# # Pull the filter that the tool passes in out of kwargs.
# # .pop() is used to remove it so that it is not also sent along in pre_filter.
# search_filter = kwargs.pop('filter', None) | |
# retriever = MongoDBAtlasHybridSearchRetriever( | |
# vectorstore=vector_store, | |
# search_index_name='search_index_v1', | |
# embedding=embed_model, | |
# text_key= 'text', | |
# embedding_key='embedding', | |
# top_k=FINAL_TOP_K, | |
# vector_penalty=HYBRID_VECTOR_PENALTY, | |
# fulltext_penalty=HYBRID_FULLTEXT_PENALTY, | |
# vector_search_params={ | |
# "k": vector_k, | |
# "numCandidates": vector_num_candidates_for_operator, | |
# "filter": search_filter | |
# }, | |
# text_search_params={ | |
# "limit": num_text_candidates | |
# }, | |
# pre_filter=kwargs | |
# ) | |
# return retriever | |