"""Build one MongoDB Atlas hybrid-search retriever per configured language.

On import this module reads its settings from the environment (after
``load_dotenv()``), then walks ``LANGUAGE_CONFIGS`` and, for every language
whose MongoDB settings are present, creates an embedding model, a
``MongoDBAtlasVectorSearch`` store and a ``MongoDBAtlasHybridSearchRetriever``.
Successfully built retrievers are stored in ``final_retrievers`` keyed by the
language's ``"code"`` value; languages that are misconfigured or fail during
setup are reported on stdout and skipped.
"""
import os

from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_mongodb.retrievers.hybrid_search import MongoDBAtlasHybridSearchRetriever
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch

load_dotenv()

# Connection settings. The username and password are read here but are not
# referenced anywhere else in this section of the file.
mongo_username = os.getenv('MONGO_USERNAME')
mongo_password = os.getenv('MONGO_PASSWORD')
mongo_database = os.getenv('MONGO_DATABASE')
mongo_connection_str = os.getenv('MONGO_CONNECTION_STRING')

# --- Common Configurations ---
MODEL_KWARGS = {"device": "cpu"}  # for the embedding model
ENCODE_KWARGS = {"normalize_embeddings": True}
# Not referenced in this section.
# NOTE(review): presumably the output size of the e5-small models — confirm.
EMBEDDING_DIMENSIONS = 384

# --- Hybrid Retrieval Configuration ---
# Number of documents each retriever is asked for. An unset *or empty*
# FINAL_TOP_K_RERANK falls back to 10 instead of crashing on int("").
# NOTE(review): the original note says the rerank step has already been
# removed, although the variable name still mentions it — confirm.
FINAL_TOP_K = int(os.getenv("FINAL_TOP_K_RERANK") or 10)
# Passed to the retriever as fulltext_penalty / vector_penalty.
# NOTE(review): presumably rank-fusion penalty constants — confirm against
# the langchain-mongodb documentation.
HYBRID_FULLTEXT_PENALTY = 60
HYBRID_VECTOR_PENALTY = 60
HYBRID_TEXT_KEY = "page_content"

# --- Language-Specific Configurations ---
# One entry per language:
#   code                     - key used in final_retrievers and in log output
#   model_name               - HuggingFace embedding model to load
#   collection_env_var       - env var that holds the MongoDB collection name
#   vector_search_index_name - index name given to the vector store
#   atlas_search_index_name  - search index name given to the hybrid retriever
LANGUAGE_CONFIGS = [
    {
        "code": "Thai",
        "model_name": "intfloat/multilingual-e5-small",
        "collection_env_var": "MONGO_COLLECTION",
        "vector_search_index_name": "embedding_2dsphere",
        "atlas_search_index_name": "search_index_th",
    },
    {
        "code": "English",
        "model_name": "intfloat/e5-small",
        "collection_env_var": "MONGO_COLLECTION_ENG",
        "vector_search_index_name": "embedding_2dsphere",
        "atlas_search_index_name": "search_index_eng",
    },
    {
        "code": "Korean",
        "model_name": "intfloat/multilingual-e5-small",
        "collection_env_var": "MONGO_COLLECTION_KOR",
        "vector_search_index_name": "embedding_2dsphere",
        "atlas_search_index_name": "search_index_kor",
    },
]

# Initialized retrievers, keyed by language code.
final_retrievers = {}

# Embedding models already loaded, keyed by model name. Several languages
# share the same model, so each one is instantiated only once.
_embedding_models = {}

# --- Setup Loop for Each Language ---
for config in LANGUAGE_CONFIGS:
    lang_code = config["code"]
    model_name = config["model_name"]
    collection_name_env = config["collection_env_var"]
    vector_idx_name = config["vector_search_index_name"]
    atlas_search_idx_name = config["atlas_search_index_name"]

    mongo_collection_name = os.getenv(collection_name_env)

    # Without a collection, database and connection string nothing can be
    # built for this language; report it and move on to the next one.
    if not mongo_collection_name or not mongo_database or not mongo_connection_str:
        print(f"Warning: MongoDB config missing for {lang_code.upper()}. Skipping.")
        continue

    print(f"\n--- Setting up for {lang_code.upper()} ---")
    print(f"Collection: {mongo_collection_name}, Embedding Model: {model_name}")
    print(f"Vector Search Index: {vector_idx_name}, Atlas Search Index: {atlas_search_idx_name}")

    # Best-effort per language: a failure is reported and the remaining
    # languages are still attempted.
    try:
        embed_model = _embedding_models.get(model_name)
        if embed_model is None:
            embed_model = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs=MODEL_KWARGS,
                encode_kwargs=ENCODE_KWARGS,
            )
            # Cached only after a successful load, so a failed load is
            # retried for the next language that needs the same model.
            _embedding_models[model_name] = embed_model

        namespace = f"{mongo_database}.{mongo_collection_name}"
        vector_store = MongoDBAtlasVectorSearch.from_connection_string(
            connection_string=mongo_connection_str,
            namespace=namespace,
            embedding=embed_model,
            index_name=vector_idx_name,
        )

        retriever = MongoDBAtlasHybridSearchRetriever(
            vectorstore=vector_store,
            search_index_name=atlas_search_idx_name,
            embedding=embed_model,
            top_k=FINAL_TOP_K,
            fulltext_penalty=HYBRID_FULLTEXT_PENALTY,
            vector_penalty=HYBRID_VECTOR_PENALTY,
            text_key=HYBRID_TEXT_KEY,
            search_kwargs={"k": FINAL_TOP_K},
        )
        final_retrievers[lang_code] = retriever
        print(f"Hybrid retriever created for {lang_code} (top {FINAL_TOP_K} documents).")
        print(f"Ensure Atlas Search index '{atlas_search_idx_name}' is configured for '{mongo_collection_name}'.")

    except Exception as e:
        print(f"Error during setup for {lang_code.upper()}: {e}")
        print("Please check MongoDB Atlas connection and index settings.")