# poc_hb / get_retriever_2.py
# Ing's picture
# app
# e061ff7
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
from langchain_mongodb.retrievers.hybrid_search import MongoDBAtlasHybridSearchRetriever
# Run python-dotenv's loader first so the lookups below can see whatever it
# adds to the process environment.
load_dotenv()

# MongoDB settings; each one is None when its variable is not set.
# mongo_username and mongo_password are not referenced in the visible part of
# this file.
mongo_username = os.environ.get("MONGO_USERNAME")
mongo_password = os.environ.get("MONGO_PASSWORD")
mongo_database = os.environ.get("MONGO_DATABASE")
mongo_connection_str = os.environ.get("MONGO_CONNECTION_STRING")
# --- Common Configurations ---
MODEL_KWARGS = {"device": "cpu"}  # for the embedding model
ENCODE_KWARGS = {"normalize_embeddings": True}
# Not referenced anywhere in the visible part of this file.
EMBEDDING_DIMENSIONS = 384

# --- Hybrid Retrieval Configuration ---
# Number of documents requested from each hybrid retriever. The variable name
# still mentions "rerank"; per the original author's note, the rerank step
# has since been removed.
# `or 10` makes a blank value (e.g. `FINAL_TOP_K_RERANK=` in a .env file) fall
# back to the default instead of raising ValueError from int("").
FINAL_TOP_K = int(os.getenv("FINAL_TOP_K_RERANK") or 10)
# Passed to the hybrid retriever as fulltext_penalty / vector_penalty.
HYBRID_FULLTEXT_PENALTY = 60
HYBRID_VECTOR_PENALTY = 60
# Passed to the hybrid retriever as text_key.
HYBRID_TEXT_KEY = "page_content"
# --- Language-Specific Configurations ---
# One entry per supported language. Each entry carries:
#   code                     - key under which the retriever is registered
#   model_name               - embedding model identifier
#   collection_env_var       - NAME of the environment variable that holds
#                              the MongoDB collection name (not the name itself)
#   vector_search_index_name - index name given to the vector store
#   atlas_search_index_name  - index name given to the hybrid retriever
LANGUAGE_CONFIGS = [
    {
        "code": "Thai",
        "model_name": "intfloat/multilingual-e5-small",
        "collection_env_var": "MONGO_COLLECTION",
        "vector_search_index_name": "embedding_2dsphere",
        "atlas_search_index_name": "search_index_th",
    },
    {
        "code": "English",
        "model_name": "intfloat/e5-small",
        "collection_env_var": "MONGO_COLLECTION_ENG",
        "vector_search_index_name": "embedding_2dsphere",
        "atlas_search_index_name": "search_index_eng",
    },
    {
        "code": "Korean",
        "model_name": "intfloat/multilingual-e5-small",
        "collection_env_var": "MONGO_COLLECTION_KOR",
        "vector_search_index_name": "embedding_2dsphere",
        "atlas_search_index_name": "search_index_kor",
    },
]
# Initialized hybrid retrievers, keyed by each config's "code" value.
# A language is absent from this dict when its configuration was incomplete
# or its setup raised.
final_retrievers = {}

# Embedding models already constructed, keyed by model name, so languages that
# share a model reuse one instance instead of loading it again.
_embedding_models = {}

# --- Setup Loop for Each Language ---
for config in LANGUAGE_CONFIGS:
    lang_code = config["code"]
    model_name = config["model_name"]
    collection_name_env = config["collection_env_var"]
    vector_idx_name = config["vector_search_index_name"]
    atlas_search_idx_name = config["atlas_search_index_name"]

    # The config stores the NAME of the environment variable; resolve it here.
    mongo_collection_name = os.getenv(collection_name_env)
    if not mongo_collection_name or not mongo_database or not mongo_connection_str:
        print(f"Warning: MongoDB config missing for {lang_code.upper()}. Skipping.")
        continue

    print(f"\n--- Setting up for {lang_code.upper()} ---")
    print(f"Collection: {mongo_collection_name}, Embedding Model: {model_name}")
    print(f"Vector Search Index: {vector_idx_name}, Atlas Search Index: {atlas_search_idx_name}")

    try:
        embed_model = _embedding_models.get(model_name)
        if embed_model is None:
            embed_model = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs=MODEL_KWARGS,
                encode_kwargs=ENCODE_KWARGS,
            )
            _embedding_models[model_name] = embed_model

        namespace = f"{mongo_database}.{mongo_collection_name}"
        vector_store = MongoDBAtlasVectorSearch.from_connection_string(
            connection_string=mongo_connection_str,
            namespace=namespace,
            embedding=embed_model,
            index_name=vector_idx_name,
        )

        # NOTE(review): the keyword arguments this retriever accepts differ
        # between langchain-mongodb releases - confirm that top_k, embedding,
        # text_key and search_kwargs are all honored by the installed version.
        retriever = MongoDBAtlasHybridSearchRetriever(
            vectorstore=vector_store,
            search_index_name=atlas_search_idx_name,
            embedding=embed_model,
            top_k=FINAL_TOP_K,
            fulltext_penalty=HYBRID_FULLTEXT_PENALTY,
            vector_penalty=HYBRID_VECTOR_PENALTY,
            text_key=HYBRID_TEXT_KEY,
            search_kwargs={"k": FINAL_TOP_K},
        )
        final_retrievers[lang_code] = retriever
        print(f"Hybrid retriever created for {lang_code} (top {FINAL_TOP_K} documents).")
        print(f"Ensure Atlas Search index '{atlas_search_idx_name}' is configured for '{mongo_collection_name}'.")
    except Exception as e:
        # Best-effort setup: report the failure and move on to the next
        # language rather than aborting the whole module import.
        print(f"Error during setup for {lang_code.upper()}: {e}")
        print("Please check MongoDB Atlas connection and index settings.")