Spaces:
Sleeping
Sleeping
use only E5 multilingual small
Browse files
app.py
CHANGED
@@ -14,64 +14,41 @@ import pickle
|
|
14 |
import re
|
15 |
import unicodedata
|
16 |
|
17 |
-
|
18 |
qdrant_client = QdrantClient(
|
19 |
url=os.environ.get("Qdrant_url"),
|
20 |
api_key=os.environ.get("Qdrant_api"),
|
21 |
)
|
22 |
|
|
|
23 |
AIRTABLE_API_KEY = os.environ.get("airtable_api")
|
24 |
BASE_ID = os.environ.get("airtable_baseid")
|
25 |
-
TABLE_NAME = "Feedback_search"
|
26 |
api = Api(AIRTABLE_API_KEY)
|
27 |
table = api.table(BASE_ID, TABLE_NAME)
|
28 |
|
29 |
-
#
|
30 |
-
|
31 |
-
|
32 |
-
"E5 large instruct (multilingual-e5-large-instruct)": SentenceTransformer("intfloat/multilingual-e5-large-instruct"),
|
33 |
-
"Kalm (KaLM-embedding-multilingual-mini-v1)": SentenceTransformer('HIT-TMG/KaLM-embedding-multilingual-mini-v1')
|
34 |
-
}
|
35 |
-
|
36 |
-
model_config = {
|
37 |
-
"E5 (intfloat/multilingual-e5-small)": {
|
38 |
-
"func": lambda query: models["E5 (intfloat/multilingual-e5-small)"].encode("query: " + query),
|
39 |
-
"collection": "product_E5",
|
40 |
-
},
|
41 |
-
"E5 large instruct (multilingual-e5-large-instruct)": {
|
42 |
-
"func": lambda query: models["E5 large instruct (multilingual-e5-large-instruct)"].encode(
|
43 |
-
"Instruct: Given a product search query, retrieve relevant product listings\nQuery: " + query, convert_to_tensor=False, normalize_embeddings=True),
|
44 |
-
"collection": "product_E5_large_instruct",
|
45 |
-
},
|
46 |
-
"Kalm (KaLM-embedding-multilingual-mini-v1)": {
|
47 |
-
"func": lambda query: models["Kalm (KaLM-embedding-multilingual-mini-v1)"].encode(query, normalize_embeddings=True),
|
48 |
-
"collection": "product_kalm",
|
49 |
-
}
|
50 |
-
}
|
51 |
-
|
52 |
-
# Global memory to hold feedback state
|
53 |
-
latest_query_result = {"query": "", "result": "", "model": "", "raw_query": "", "time": ""}
|
54 |
|
|
|
55 |
with open("keyword_whitelist.pkl", "rb") as f:
|
56 |
keyword_whitelist = pickle.load(f)
|
57 |
|
|
|
58 |
def normalize(text: str) -> str:
|
59 |
text = unicodedata.normalize("NFC", text)
|
60 |
-
|
61 |
-
return text.strip().lower()
|
62 |
|
63 |
def smart_tokenize(text: str) -> list:
|
64 |
tokens = word_tokenize(text.strip(), engine="newmm")
|
65 |
-
if
|
66 |
-
return [text.strip()]
|
67 |
-
return tokens
|
68 |
|
69 |
def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3):
|
70 |
query_norm = normalize(query)
|
71 |
tokens = smart_tokenize(query_norm)
|
72 |
corrected = []
|
73 |
i = 0
|
74 |
-
|
75 |
while i < len(tokens):
|
76 |
matched = False
|
77 |
for n in range(min(max_ngram, len(tokens) - i), 0, -1):
|
@@ -85,22 +62,17 @@ def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3
|
|
85 |
if not matched:
|
86 |
corrected.append(tokens[i])
|
87 |
i += 1
|
|
|
88 |
|
89 |
-
|
90 |
-
|
91 |
-
return "".join(cleaned)
|
92 |
|
93 |
-
#
|
94 |
-
def search_product(query
|
95 |
start_time = time.time()
|
96 |
-
if model_name not in model_config:
|
97 |
-
return "<p>❌ ไม่พบโมเดล</p>"
|
98 |
-
|
99 |
latest_query_result["raw_query"] = query
|
100 |
-
corrected_query = correct_query_merge_phrases(query,keyword_whitelist)
|
101 |
-
|
102 |
-
query_embed = model_config[model_name]["func"](corrected_query)
|
103 |
-
collection_name = model_config[model_name]["collection"]
|
104 |
|
105 |
try:
|
106 |
result = qdrant_client.query_points(
|
@@ -118,60 +90,52 @@ def search_product(query, model_name):
|
|
118 |
if corrected_query != query:
|
119 |
html_output += f"<p>🔧 แก้คำค้นจาก: <code>{query}</code> → <code>{corrected_query}</code></p>"
|
120 |
|
121 |
-
html_output += ""
|
122 |
-
<div style="display: grid; grid-template-columns: repeat(auto-fill, minmax(220px, 1fr)); gap: 20px;">
|
123 |
-
"""
|
124 |
|
125 |
-
result_summary = ""
|
126 |
-
found = False
|
127 |
for res in result:
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
|
|
144 |
</div>
|
145 |
-
|
146 |
-
|
147 |
-
result_summary += f"{name} (score: {score}) | "
|
148 |
|
149 |
html_output += "</div>"
|
150 |
|
151 |
if not found:
|
152 |
-
|
153 |
-
|
154 |
-
❌ ไม่พบสินค้าที่เกี่ยวข้องกับคำค้นนี้
|
155 |
-
</div>
|
156 |
-
"""
|
157 |
-
return html_output
|
158 |
-
|
159 |
-
latest_query_result["query"] = corrected_query
|
160 |
-
latest_query_result["result"] = result_summary.strip()
|
161 |
-
latest_query_result["model"] = model_name
|
162 |
-
latest_query_result["time"] = elapsed
|
163 |
|
164 |
-
|
|
|
|
|
|
|
|
|
165 |
|
|
|
166 |
|
167 |
-
#
|
168 |
def log_feedback(feedback):
|
169 |
try:
|
170 |
now = datetime.now().strftime("%Y-%m-%d")
|
171 |
table.create({
|
172 |
"timestamp": now,
|
173 |
"raw_query": latest_query_result["raw_query"],
|
174 |
-
"model": latest_query_result["model"],
|
175 |
"query": latest_query_result["query"],
|
176 |
"result": latest_query_result["result"],
|
177 |
"time(second)": latest_query_result["time"],
|
@@ -181,20 +145,12 @@ def log_feedback(feedback):
|
|
181 |
except Exception as e:
|
182 |
return f"❌ Failed to save feedback: {str(e)}"
|
183 |
|
184 |
-
|
185 |
-
# 🎨 Gradio UI
|
186 |
with gr.Blocks() as demo:
|
187 |
gr.Markdown("## 🔎 Product Semantic Search (Vector Search + Qdrant)")
|
188 |
|
189 |
-
|
190 |
-
|
191 |
-
choices=list(models.keys()),
|
192 |
-
label="เลือกโมเดล",
|
193 |
-
value="E5 (intfloat/multilingual-e5-small)"
|
194 |
-
)
|
195 |
-
query_input = gr.Textbox(label="พิมพ์คำค้นหา")
|
196 |
-
|
197 |
-
result_output = gr.HTML(label="📋 ผลลัพธ์") # HTML แสดงผลลัพธ์พร้อมรูป
|
198 |
|
199 |
with gr.Row():
|
200 |
match_btn = gr.Button("✅ ตรง")
|
@@ -202,9 +158,9 @@ with gr.Blocks() as demo:
|
|
202 |
|
203 |
feedback_status = gr.Textbox(label="📬 สถานะ Feedback")
|
204 |
|
205 |
-
query_input.submit(search_product, inputs=[query_input
|
206 |
match_btn.click(lambda: log_feedback("match"), outputs=feedback_status)
|
207 |
not_match_btn.click(lambda: log_feedback("not_match"), outputs=feedback_status)
|
208 |
|
209 |
-
# Run
|
210 |
-
demo.launch(share=True)
|
|
|
14 |
import re
|
15 |
import unicodedata
|
16 |
|
17 |
+
# Setup Qdrant Client — connection details come from environment variables.
# NOTE(review): the env var names "Qdrant_url"/"Qdrant_api" use unusual mixed
# casing — confirm they match the deployment's configured secrets.
qdrant_client = QdrantClient(
    url=os.environ.get("Qdrant_url"),
    api_key=os.environ.get("Qdrant_api"),
)

# Airtable Config — credentials from the environment; feedback rows are
# written to the fixed "Feedback_search" table (see log_feedback below).
AIRTABLE_API_KEY = os.environ.get("airtable_api")
BASE_ID = os.environ.get("airtable_baseid")
TABLE_NAME = "Feedback_search"
api = Api(AIRTABLE_API_KEY)
table = api.table(BASE_ID, TABLE_NAME)
|
29 |
|
30 |
+
# Load model — this revision uses a single embedding model (multilingual E5
# small) instead of a user-selectable set. Search code prepends "query: "
# before encoding, matching the E5 query-prefix convention.
model = SentenceTransformer('intfloat/multilingual-e5-small')
# Qdrant collection holding the product vectors produced by this model.
collection_name = "product_E5"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
+
# Load whitelist — known product keywords/phrases used by
# correct_query_merge_phrases for fuzzy spell-correction.
# NOTE(review): pickle.load can execute arbitrary code if the file is
# tampered with — keep keyword_whitelist.pkl a trusted, bundled artifact.
with open("keyword_whitelist.pkl", "rb") as f:
    keyword_whitelist = pickle.load(f)
|
37 |
|
38 |
+
# Utils
|
39 |
def normalize(text: str) -> str:
    """Canonicalise a search query.

    Applies Unicode NFC normalisation, repairs two common Thai keyboard
    mistypes of the sara-ae vowel, then trims whitespace and lowercases.
    """
    canonical = unicodedata.normalize("NFC", text)
    # "เ"+"แ" and "เ"+"เ" are frequent mistypes of the single character "แ".
    for mistyped in ("เแ", "เเ"):
        canonical = canonical.replace(mistyped, "แ")
    return canonical.strip().lower()
|
|
|
42 |
|
43 |
def smart_tokenize(text: str) -> list:
    """Tokenise text with PyThaiNLP's newmm engine, with a fallback.

    If the tokenizer produces nothing, or its tokens cover less than half
    of the input's characters, the whole (stripped) string is returned as
    a single token instead of the lossy tokenisation.
    """
    stripped = text.strip()
    tokens = word_tokenize(stripped, engine="newmm")
    # Guard: only trust the tokenisation when it keeps most of the input.
    if tokens and len("".join(tokens)) >= len(stripped) * 0.5:
        return tokens
    return [stripped]
|
|
|
|
|
46 |
|
47 |
def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3):
|
48 |
query_norm = normalize(query)
|
49 |
tokens = smart_tokenize(query_norm)
|
50 |
corrected = []
|
51 |
i = 0
|
|
|
52 |
while i < len(tokens):
|
53 |
matched = False
|
54 |
for n in range(min(max_ngram, len(tokens) - i), 0, -1):
|
|
|
62 |
if not matched:
|
63 |
corrected.append(tokens[i])
|
64 |
i += 1
|
65 |
+
return "".join([word for word in corrected if len(word) > 1 or word in whitelist])
|
66 |
|
67 |
+
# Module-level scratchpad: the most recent search is recorded here so the
# feedback buttons (via log_feedback) can report what the user reacted to.
latest_query_result = dict(query="", result="", raw_query="", time="")
|
|
|
69 |
|
70 |
+
# Main Search
|
71 |
+
def search_product(query):
|
72 |
start_time = time.time()
|
|
|
|
|
|
|
73 |
latest_query_result["raw_query"] = query
|
74 |
+
corrected_query = correct_query_merge_phrases(query, keyword_whitelist)
|
75 |
+
query_embed = model.encode("query: " + corrected_query)
|
|
|
|
|
76 |
|
77 |
try:
|
78 |
result = qdrant_client.query_points(
|
|
|
90 |
if corrected_query != query:
|
91 |
html_output += f"<p>🔧 แก้คำค้นจาก: <code>{query}</code> → <code>{corrected_query}</code></p>"
|
92 |
|
93 |
+
html_output += '<div style="display: grid; grid-template-columns: repeat(auto-fill, minmax(220px, 1fr)); gap: 20px;">'
|
|
|
|
|
94 |
|
95 |
+
result_summary, found = "", False
|
|
|
96 |
for res in result:
|
97 |
+
if res.score > 0.8:
|
98 |
+
found = True
|
99 |
+
name = res.payload.get("name", "ไม่ทราบชื่อสินค้า")
|
100 |
+
score = f"{res.score:.4f}"
|
101 |
+
img_url = res.payload.get("imageUrl", "")
|
102 |
+
price = res.payload.get("price", "ไม่ระบุ")
|
103 |
+
brand = res.payload.get("brand", "")
|
104 |
+
|
105 |
+
html_output += f"""
|
106 |
+
<div style="border: 1px solid #ddd; border-radius: 8px; padding: 10px; text-align: center; box-shadow: 1px 1px 5px rgba(0,0,0,0.1); background: #fff;">
|
107 |
+
<img src="{img_url}" style="width: 100%; max-height: 150px; object-fit: contain; border-radius: 4px;">
|
108 |
+
<div style="margin-top: 10px;">
|
109 |
+
<div style="font-weight: bold; font-size: 14px;">{name}</div>
|
110 |
+
<div style="color: gray; font-size: 12px;">{brand}</div>
|
111 |
+
<div style="color: green; margin: 4px 0;">฿{price}</div>
|
112 |
+
<div style="font-size: 12px; color: #555;">score: {score}</div>
|
113 |
+
</div>
|
114 |
</div>
|
115 |
+
"""
|
116 |
+
result_summary += f"{name} (score: {score}) | "
|
|
|
117 |
|
118 |
html_output += "</div>"
|
119 |
|
120 |
if not found:
|
121 |
+
html_output += '<div style="text-align: center; font-size: 18px; color: #a00; padding: 30px;">❌ ไม่พบสินค้าที่เกี่ยวข้องกับคำค้นนี้</div>'
|
122 |
+
return html_output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
+
latest_query_result.update({
|
125 |
+
"query": corrected_query,
|
126 |
+
"result": result_summary.strip(),
|
127 |
+
"time": elapsed,
|
128 |
+
})
|
129 |
|
130 |
+
return html_output
|
131 |
|
132 |
+
# Feedback logging
|
133 |
def log_feedback(feedback):
|
134 |
try:
|
135 |
now = datetime.now().strftime("%Y-%m-%d")
|
136 |
table.create({
|
137 |
"timestamp": now,
|
138 |
"raw_query": latest_query_result["raw_query"],
|
|
|
139 |
"query": latest_query_result["query"],
|
140 |
"result": latest_query_result["result"],
|
141 |
"time(second)": latest_query_result["time"],
|
|
|
145 |
except Exception as e:
|
146 |
return f"❌ Failed to save feedback: {str(e)}"
|
147 |
|
148 |
+
# Gradio UI
|
|
|
149 |
with gr.Blocks() as demo:
|
150 |
gr.Markdown("## 🔎 Product Semantic Search (Vector Search + Qdrant)")
|
151 |
|
152 |
+
query_input = gr.Textbox(label="พิมพ์คำค้นหา")
|
153 |
+
result_output = gr.HTML(label="📋 ผลลัพธ์")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
|
155 |
with gr.Row():
|
156 |
match_btn = gr.Button("✅ ตรง")
|
|
|
158 |
|
159 |
feedback_status = gr.Textbox(label="📬 สถานะ Feedback")
|
160 |
|
161 |
+
query_input.submit(search_product, inputs=[query_input], outputs=result_output)
|
162 |
match_btn.click(lambda: log_feedback("match"), outputs=feedback_status)
|
163 |
not_match_btn.click(lambda: log_feedback("not_match"), outputs=feedback_status)
|
164 |
|
165 |
+
# Run
# NOTE(review): share=True asks Gradio to create a public share link in
# addition to the local server — confirm this is intended for the
# deployment target (hosted Spaces serve the app themselves).
demo.launch(share=True)
|