Spaces:
Sleeping
Sleeping
File size: 4,730 Bytes
30adccc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
from langchain_mongodb.retrievers.hybrid_search import MongoDBAtlasHybridSearchRetriever
import os
from dotenv import load_dotenv
load_dotenv()
# ---- MongoDB credentials ----
# mongo_username = os.getenv('MONGO_USERNAME')
# mongo_password = os.getenv('MONGO_PASSWORD')
mongo_database = os.getenv('MONGO_DATABASE')
mongo_connection_str = os.getenv('MONGO_CONNECTION_STRING')
mongo_collection_name = os.getenv('MONGO_COLLECTION')
# ---- Common Configurations & Hybrid Retrieval Configuration ----
MODEL_KWARGS = {"device": "cpu"}
ENCODE_KWARGS = {"normalize_embeddings": True,
"batch_size": 32}
EMBEDDING_DIMENSIONS = 1024
MODEL_NAME = "BAAI/bge-m3"
FINAL_TOP_K = 50 # 30
HYBRID_FULLTEXT_PENALTY = 0 # 60
HYBRID_VECTOR_PENALTY = 0.8 # 60
# ---- Embedding model ----
embed_model = HuggingFaceEmbeddings(
model_name=MODEL_NAME,
model_kwargs=MODEL_KWARGS,
encode_kwargs=ENCODE_KWARGS
)
# ---- Vectore Search ----
num_vector_candidates = max(20, 2 * FINAL_TOP_K)
num_text_candidates = max(20, 2 * FINAL_TOP_K)
vector_k = num_vector_candidates
vector_num_candidates_for_operator = vector_k * 10
# ---- Vectore Store ----
vector_store = MongoDBAtlasVectorSearch.from_connection_string(
connection_string=mongo_connection_str,
namespace=f"{mongo_database}.{mongo_collection_name}",
embedding=embed_model,
index_name="search_index_v1",
)
# ---- Retriever (Hybrid) ----
def get_retriever(**kwargs):
"""
สร้าง Retriever โดยสามารถรับ filter พิเศษสำหรับ Vector Search ได้
"""
# ดึง vector_search_filter ออกมาจาก kwargs
vector_search_filter = kwargs.pop('vector_search_filter', None)
# kwargs ที่เหลือ (ถ้ามี) จะถูกใช้เป็น pre_filter
pre_filter = kwargs if kwargs else None
retriever = MongoDBAtlasHybridSearchRetriever(
vectorstore=vector_store,
search_index_name='search_index_v1',
embedding=embed_model,
text_key= 'text',
embedding_key='embedding',
top_k=FINAL_TOP_K,
vector_penalty=HYBRID_VECTOR_PENALTY,
fulltext_penalty=HYBRID_FULLTEXT_PENALTY,
vector_search_params={
"k": vector_k,
"numCandidates": vector_num_candidates_for_operator,
# --- ส่ง filter ที่ถูกต้องเข้าไปในตำแหน่งที่ถูกต้อง ---
"filter": vector_search_filter
},
text_search_params={
"limit": max(20, 2 * FINAL_TOP_K)
},
pre_filter=pre_filter
)
return retriever
# def get_retriever(**kwargs):
# retriever = MongoDBAtlasHybridSearchRetriever(
# vectorstore=vector_store,
# search_index_name='search_index_v1',
# embedding=embed_model,
# text_key= 'text', #'token',
# embedding_key='embedding',
# top_k=FINAL_TOP_K,
# vector_penalty=HYBRID_VECTOR_PENALTY,
# fulltext_penalty=HYBRID_FULLTEXT_PENALTY,
# vector_search_params={
# "k": vector_k,
# "numCandidates": vector_num_candidates_for_operator
# },
# text_search_params={
# "limit": num_text_candidates
# },
# pre_filter=kwargs
# )
# return retriever
# ---------- FILTER METAAAAA ----------
# ---------- FILTER METAAAAA ----------
# ---------- FILTER METAAAAA ----------
# ---------- FILTER METAAAAA ----------
# def get_retriever(**kwargs):
# # ดึง filter ที่เราจะส่งมาจาก tool ออกมาจาก kwargs
# # เราใช้ .pop() เพื่อเอามันออกมา จะได้ไม่ถูกส่งไปที่ pre_filter ซ้ำซ้อน
# search_filter = kwargs.pop('filter', None)
# retriever = MongoDBAtlasHybridSearchRetriever(
# vectorstore=vector_store,
# search_index_name='search_index_v1',
# embedding=embed_model,
# text_key= 'text',
# embedding_key='embedding',
# top_k=FINAL_TOP_K,
# vector_penalty=HYBRID_VECTOR_PENALTY,
# fulltext_penalty=HYBRID_FULLTEXT_PENALTY,
# vector_search_params={
# "k": vector_k,
# "numCandidates": vector_num_candidates_for_operator,
# "filter": search_filter
# },
# text_search_params={
# "limit": num_text_candidates
# },
# pre_filter=kwargs
# )
# return retriever
|