import os

from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_mongodb.retrievers.hybrid_search import MongoDBAtlasHybridSearchRetriever
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch


# Load variables from a .env file (python-dotenv) into the process environment
# before any of the lookups below run.
load_dotenv()

# MongoDB settings read from the environment; each is None when unset.
# NOTE(review): mongo_username and mongo_password are not referenced anywhere
# else in the visible part of this file -- confirm whether they are still
# needed (the credentials may already be part of MONGO_CONNECTION_STRING).
mongo_username = os.getenv('MONGO_USERNAME')
mongo_password = os.getenv('MONGO_PASSWORD')
mongo_database = os.getenv('MONGO_DATABASE')
mongo_connection_str = os.getenv('MONGO_CONNECTION_STRING')


def _env_positive_int(name: str, default: int) -> int:
    """Return environment variable *name* as a positive int, else *default*.

    An unset variable yields *default* silently. A value that is set but
    empty, non-numeric, or smaller than 1 prints a warning and also yields
    *default*, so a malformed setting cannot abort the module import.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        value = int(raw)
    except ValueError:
        value = None
    if value is None or value < 1:
        print(f"Warning: invalid {name}={raw!r}; falling back to {default}.")
        return default
    return value


# Keyword arguments handed to HuggingFaceEmbeddings for every language.
MODEL_KWARGS = {"device": "cpu"}
ENCODE_KWARGS = {"normalize_embeddings": True}
# NOTE(review): not referenced in the visible part of this file; presumably
# the output size of the e5-small models -- confirm before relying on it.
EMBEDDING_DIMENSIONS = 384

# Number of documents each hybrid retriever is asked to return; overridable
# through the FINAL_TOP_K_RERANK environment variable (default 10).
FINAL_TOP_K = _env_positive_int("FINAL_TOP_K_RERANK", 10)
# Penalty values passed to the hybrid retriever for each search branch.
HYBRID_FULLTEXT_PENALTY = 60
HYBRID_VECTOR_PENALTY = 60
HYBRID_TEXT_KEY = "page_content"


# One entry per supported language. Each entry names:
#   code                     - key under which the retriever is registered
#   model_name               - HuggingFace embedding model to load
#   collection_env_var       - environment variable holding the collection name
#   vector_search_index_name - index name given to the vector store
#   atlas_search_index_name  - index name given to the hybrid retriever
LANGUAGE_CONFIGS = [
    {
        "code": "Thai",
        "model_name": "intfloat/multilingual-e5-small",
        "collection_env_var": "MONGO_COLLECTION",
        "vector_search_index_name": "embedding_2dsphere",
        "atlas_search_index_name": "search_index_th",
    },
    {
        "code": "English",
        "model_name": "intfloat/e5-small",
        "collection_env_var": "MONGO_COLLECTION_ENG",
        "vector_search_index_name": "embedding_2dsphere",
        "atlas_search_index_name": "search_index_eng",
    },
    {
        "code": "Korean",
        "model_name": "intfloat/multilingual-e5-small",
        "collection_env_var": "MONGO_COLLECTION_KOR",
        "vector_search_index_name": "embedding_2dsphere",
        "atlas_search_index_name": "search_index_kor",
    },
]


# Maps a language code ("Thai", "English", "Korean") to its hybrid retriever.
# Languages whose configuration is missing or whose setup fails are absent.
final_retrievers = {}

# Embedding models already loaded, keyed by model name, so that languages
# sharing a model (Thai and Korean) reuse a single instance.
_embedding_models = {}

for config in LANGUAGE_CONFIGS:
    lang_code = config["code"]
    model_name = config["model_name"]
    collection_name_env = config["collection_env_var"]
    vector_idx_name = config["vector_search_index_name"]
    atlas_search_idx_name = config["atlas_search_index_name"]

    mongo_collection_name = os.getenv(collection_name_env)

    # Without a collection, database and connection string nothing can be
    # built for this language; warn and move on to the next one.
    if not mongo_collection_name or not mongo_database or not mongo_connection_str:
        print(f"Warning: MongoDB config missing for {lang_code.upper()}. Skipping.")
        continue

    print(f"\n--- Setting up for {lang_code.upper()} ---")
    print(f"Collection: {mongo_collection_name}, Embedding Model: {model_name}")
    print(f"Vector Search Index: {vector_idx_name}, Atlas Search Index: {atlas_search_idx_name}")

    # A failure for one language is reported and must not prevent the
    # remaining languages from being set up.
    try:
        embed_model = _embedding_models.get(model_name)
        if embed_model is None:
            embed_model = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs=MODEL_KWARGS,
                encode_kwargs=ENCODE_KWARGS,
            )
            _embedding_models[model_name] = embed_model

        namespace = f"{mongo_database}.{mongo_collection_name}"
        vector_store = MongoDBAtlasVectorSearch.from_connection_string(
            connection_string=mongo_connection_str,
            namespace=namespace,
            embedding=embed_model,
            index_name=vector_idx_name,
        )

        # NOTE(review): `embedding`, `text_key` and `search_kwargs` may not be
        # fields of MongoDBAtlasHybridSearchRetriever; if the installed
        # langchain-mongodb version ignores them, the full-text branch would
        # use the vector store's own text key rather than HYBRID_TEXT_KEY.
        # Confirm against the library before changing these arguments.
        retriever = MongoDBAtlasHybridSearchRetriever(
            vectorstore=vector_store,
            search_index_name=atlas_search_idx_name,
            embedding=embed_model,
            top_k=FINAL_TOP_K,
            fulltext_penalty=HYBRID_FULLTEXT_PENALTY,
            vector_penalty=HYBRID_VECTOR_PENALTY,
            text_key=HYBRID_TEXT_KEY,
            search_kwargs={"k": FINAL_TOP_K},
        )

        final_retrievers[lang_code] = retriever
        print(f"Hybrid retriever created for {lang_code} (top {FINAL_TOP_K} documents).")
        print(f"Ensure Atlas Search index '{atlas_search_idx_name}' is configured for '{mongo_collection_name}'.")

    except Exception as e:
        print(f"Error during setup for {lang_code.upper()}: {e}")
        print("Please check MongoDB Atlas connection and index settings.")