File size: 3,873 Bytes
e061ff7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_mongodb.vectorstores import MongoDBAtlasVectorSearch
from langchain_mongodb.retrievers.hybrid_search import MongoDBAtlasHybridSearchRetriever

# load_dotenv() (python-dotenv) runs before the reads below so that values
# supplied through a .env file are visible in the process environment.
load_dotenv()

# MongoDB connection settings. Each name is None when its variable is unset.
# mongo_username / mongo_password are read here but not referenced elsewhere
# in this section of the file.
(
    mongo_username,
    mongo_password,
    mongo_database,
    mongo_connection_str,
) = map(
    os.environ.get,
    (
        "MONGO_USERNAME",
        "MONGO_PASSWORD",
        "MONGO_DATABASE",
        "MONGO_CONNECTION_STRING",
    ),
)

# --- Common Configurations ---
MODEL_KWARGS = {"device": "cpu"}  # passed as model_kwargs to the embedding model
ENCODE_KWARGS = {"normalize_embeddings": True}  # passed as encode_kwargs
# Presumably the output size of the e5-small embedding models; not referenced
# in this section of the file -- TODO confirm it is still needed.
EMBEDDING_DIMENSIONS = 384

# --- Hybrid Retrieval Configuration ---


def _positive_int_from_env(var_name, default):
    """Return environment variable *var_name* parsed as a positive integer.

    Args:
        var_name: Name of the environment variable to read.
        default: Value returned when the variable is unset or unusable.

    Returns:
        The parsed integer when it is >= 1, otherwise *default*. A warning is
        printed when the variable is set but empty, non-numeric or below 1;
        an unset variable falls back silently.
    """
    raw_value = os.getenv(var_name)
    if raw_value is None:
        return default
    try:
        parsed = int(raw_value)
    except ValueError:
        parsed = None
    if parsed is None or parsed < 1:
        print(
            f"Warning: {var_name}={raw_value!r} is not a positive integer. "
            f"Using default {default}."
        )
        return default
    return parsed


# Number of documents the hybrid retriever returns. The variable name still
# mentions "rerank"; per the original note the rerank step has been removed.
FINAL_TOP_K = _positive_int_from_env("FINAL_TOP_K_RERANK", 10)
HYBRID_FULLTEXT_PENALTY = 60
HYBRID_VECTOR_PENALTY = 60
HYBRID_TEXT_KEY = "page_content"

# --- Language-Specific Configurations ---
# One dict per supported language, built from the compact table below.
#   code                     - language label
#   model_name               - embedding model identifier
#   collection_env_var       - name of the env var holding the collection name
#   vector_search_index_name - vector index name (identical for every language)
#   atlas_search_index_name  - full-text search index name
LANGUAGE_CONFIGS = [
    {
        "code": label,
        "model_name": model,
        "collection_env_var": env_var,
        "vector_search_index_name": "embedding_2dsphere",
        "atlas_search_index_name": fulltext_index,
    }
    for label, model, env_var, fulltext_index in (
        ("Thai", "intfloat/multilingual-e5-small", "MONGO_COLLECTION", "search_index_th"),
        ("English", "intfloat/e5-small", "MONGO_COLLECTION_ENG", "search_index_eng"),
        ("Korean", "intfloat/multilingual-e5-small", "MONGO_COLLECTION_KOR", "search_index_kor"),
    )
]

# Registry of initialized hybrid retrievers, keyed by the "code" value of each
# LANGUAGE_CONFIGS entry. A language whose setup is skipped or fails is absent.
final_retrievers = {}

# Embedding models already constructed, keyed by model name. Languages that
# share a model reuse one instance instead of loading the same weights again.
_embedding_models = {}

# --- Setup Loop for Each Language ---
# Best-effort: a missing setting or a failure for one language is reported on
# stdout and does not prevent the remaining languages from being set up.
for config in LANGUAGE_CONFIGS:
    lang_code = config["code"]
    model_name = config["model_name"]
    collection_name_env = config["collection_env_var"]
    vector_idx_name = config["vector_search_index_name"]
    atlas_search_idx_name = config["atlas_search_index_name"]

    # The collection name is looked up per language; database and connection
    # string are shared by all languages.
    mongo_collection_name = os.getenv(collection_name_env)

    if not mongo_collection_name or not mongo_database or not mongo_connection_str:
        print(f"Warning: MongoDB config missing for {lang_code.upper()}. Skipping.")
        continue

    print(f"\n--- Setting up for {lang_code.upper()} ---")
    print(f"Collection: {mongo_collection_name}, Embedding Model: {model_name}")
    print(f"Vector Search Index: {vector_idx_name}, Atlas Search Index: {atlas_search_idx_name}")

    try:
        # Only cache after successful construction so that a failed load is
        # retried by the next language using the same model.
        embed_model = _embedding_models.get(model_name)
        if embed_model is None:
            embed_model = HuggingFaceEmbeddings(
                model_name=model_name,
                model_kwargs=MODEL_KWARGS,
                encode_kwargs=ENCODE_KWARGS,
            )
            _embedding_models[model_name] = embed_model

        # Namespace format is "<database>.<collection>".
        namespace = f"{mongo_database}.{mongo_collection_name}"
        vector_store = MongoDBAtlasVectorSearch.from_connection_string(
            connection_string=mongo_connection_str,
            namespace=namespace,
            embedding=embed_model,
            index_name=vector_idx_name,
        )

        # NOTE(review): `embedding`, `text_key` and `search_kwargs` may not be
        # declared fields of MongoDBAtlasHybridSearchRetriever -- confirm they
        # take effect; the text key may need to be set on the vector store.
        retriever = MongoDBAtlasHybridSearchRetriever(
            vectorstore=vector_store,
            search_index_name=atlas_search_idx_name,
            embedding=embed_model,
            top_k=FINAL_TOP_K,
            fulltext_penalty=HYBRID_FULLTEXT_PENALTY,
            vector_penalty=HYBRID_VECTOR_PENALTY,
            text_key=HYBRID_TEXT_KEY,
            search_kwargs={"k": FINAL_TOP_K},
        )

        final_retrievers[lang_code] = retriever
        print(f"Hybrid retriever created for {lang_code} (top {FINAL_TOP_K} documents).")
        print(f"Ensure Atlas Search index '{atlas_search_idx_name}' is configured for '{mongo_collection_name}'.")

    except Exception as e:
        # Deliberately broad: any failure for one language is reported and the
        # loop moves on to the next language.
        print(f"Error during setup for {lang_code.upper()}: {e}")
        print("Please check MongoDB Atlas connection and index settings.")