Update app.py
app.py
CHANGED
@@ -7,7 +7,7 @@ import time
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

- # MICHELIN Genesis API key
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)

@@ -18,18 +18,45 @@ model = genai.GenerativeModel("gemini-2.0-flash-thinking-exp-1219")
# Load datasets
########################

- # Health
health_dataset = load_dataset("vinven7/PharmKG")
-
# Recipe dataset
recipe_dataset = load_dataset("AkashPS11/recipes_data_food.com")
-
# Korean food information dataset
korean_food_dataset = load_dataset("SGTCho/korean_food")

# Load the sentence embedding model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


def format_chat_history(messages: list) -> list:
    """
@@ -48,31 +75,34 @@ def find_most_similar_data(query: str):

def find_most_similar_data(query: str):
    """
-     Search for the data most similar to the input query.
    """
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    most_similar = None
    highest_similarity = -1

-     # Health dataset
-     for split in health_dataset.keys():
-         for item in health_dataset[split]:
            # Example: assume the health data has an (Input, Output) structure
            if 'Input' in item and 'Output' in item:
                item_text = f"[Health Info]\nInput: {item['Input']} | Output: {item['Output']}"
                item_embedding = embedding_model.encode(item_text, convert_to_tensor=True)
                similarity = util.pytorch_cos_sim(query_embedding, item_embedding).item()
-
                if similarity > highest_similarity:
                    highest_similarity = similarity
                    most_similar = item_text

-     # Recipe dataset
-     for split in recipe_dataset.keys():
-         for item in recipe_dataset[split]:
-             # The actual fields depend on the dataset structure
-             # Here, as a simple example, assume fields such as 'recipe_name', 'ingredients', 'instructions'
-             # The real dataset may use different field names, so they need to be checked
            text_components = []
            if 'recipe_name' in item:
                text_components.append(f"Recipe Name: {item['recipe_name']}")
@@ -90,10 +120,10 @@ def find_most_similar_data(query: str):
                highest_similarity = similarity
                most_similar = item_text

-     # Korean food
-     for split in korean_food_dataset.keys():
-         for item in korean_food_dataset[split]:
-             #
            text_components = []
            if 'name' in item:
                text_components.append(f"Name: {item['name']}")
@@ -134,7 +164,6 @@ def stream_gemini_response(user_message: str, messages: list) -> Iterator[list]:
    most_similar_data = find_most_similar_data(user_message)

    # Set up the system message and prompt
-     # Set up "MICHELIN Genesis" as an AI that provides health analysis, recipes, and creative flavor guidance
    system_message = (
        "I present innovative recipes for new flavors and health, "
        "combining diverse recipe data, including Korean cuisine, with health knowledge, "
@@ -158,7 +187,6 @@ def stream_gemini_response(user_message: str, messages: list) -> Iterator[list]:
    [Data Reference]
    """

-     # If related data exists, pass it along as well
    if most_similar_data:
        prefixed_message = f"{system_prefix} {system_message}\n\n[Related Data]\n{most_similar_data}\n\nUser question: {user_message}"
    else:
@@ -220,7 +248,6 @@ def stream_gemini_response(user_message: str, messages: list) -> Iterator[list]:
                        role="assistant",
                        content=response_buffer
                    )
-
                else:
                    # Streaming the "Thinking" output
                    thought_buffer += current_chunk
@@ -412,7 +439,7 @@ with gr.Blocks(
            )
            clear_button = gr.Button("Clear Conversation", scale=1)

-         # Example questions
        example_prompts = [
            ["Please create a new, creative pasta recipe, and reason through how you draw out the harmony of flavors in the process."],
            ["I want to make a special vegan dessert. What could I use as a chocolate substitute?"],
@@ -472,7 +499,6 @@ with gr.Blocks(
            )
            custom_clear_button = gr.Button("Clear Conversation", scale=1)

-         # Examples
        custom_example_prompts = [
            ["Please put together a low-sugar Korean meal plan for a diabetic patient. I would like the menu for each meal and the nutritional information of the ingredients."],
            ["I want to develop a food recipe that is good for a specific condition (e.g., gastric ulcer). Please explain your suggestion and its scientific basis."],

from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util

+ # MICHELIN Genesis API key
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)

# Load datasets
########################

+ # Health information dataset (PharmKG)
health_dataset = load_dataset("vinven7/PharmKG")
# Recipe dataset
recipe_dataset = load_dataset("AkashPS11/recipes_data_food.com")
# Korean food information dataset
korean_food_dataset = load_dataset("SGTCho/korean_food")

# Load the sentence embedding model
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

+ ########################
+ # (Added) Partial sampling
+ ########################
+
+ # Iterating over every record in health_dataset, recipe_dataset, and korean_food_dataset
+ # can make each query take a long time. For testing, take at most 100 samples from each split:
+ MAX_SAMPLES = 100
+
+ # Partial sample of the health dataset
+ health_subset = {}
+ for split in health_dataset.keys():
+     ds_split = health_dataset[split]
+     sub_len = min(MAX_SAMPLES, len(ds_split))
+     health_subset[split] = ds_split.select(range(sub_len))
+
+ # Partial sample of the recipe dataset
+ recipe_subset = {}
+ for split in recipe_dataset.keys():
+     ds_split = recipe_dataset[split]
+     sub_len = min(MAX_SAMPLES, len(ds_split))
+     recipe_subset[split] = ds_split.select(range(sub_len))
+
+ # Partial sample of the Korean food dataset
+ korean_subset = {}
+ for split in korean_food_dataset.keys():
+     ds_split = korean_food_dataset[split]
+     sub_len = min(MAX_SAMPLES, len(ds_split))
+     korean_subset[split] = ds_split.select(range(sub_len))
+

def format_chat_history(messages: list) -> list:
    """
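A note on the sampling added above: even with MAX_SAMPLES, find_most_similar_data() still re-encodes every sampled item on every query. A further speed-up, sketched here purely as an illustration and not part of this commit, would be to embed the sampled items once at startup and reuse the cached tensors (build_embedding_cache and find_best_cached are hypothetical helpers; the field names follow the assumptions used in the code below):

# Illustrative sketch only -- assumes embedding_model and health_subset as defined above.
from sentence_transformers import util

def build_embedding_cache(subset, text_fn):
    # Encode each sampled item once and keep (text, embedding) pairs for reuse.
    cache = []
    for split in subset.keys():
        for item in subset[split]:
            text = text_fn(item)
            if text:
                cache.append((text, embedding_model.encode(text, convert_to_tensor=True)))
    return cache

health_cache = build_embedding_cache(
    health_subset,
    lambda it: f"[Health Info]\nInput: {it['Input']} | Output: {it['Output']}"
    if 'Input' in it and 'Output' in it else None,
)

def find_best_cached(query, cache):
    # Only the query is embedded at request time; item embeddings come from the cache.
    query_emb = embedding_model.encode(query, convert_to_tensor=True)
    best_text, best_score = None, -1.0
    for text, emb in cache:
        score = util.pytorch_cos_sim(query_emb, emb).item()
        if score > best_score:
            best_text, best_score = text, score
    return best_text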

def find_most_similar_data(query: str):
    """
+     Search for the data most similar to the input query in
+     1) the health dataset (health_subset)
+     2) the recipe dataset (recipe_subset)
+     3) the Korean food dataset (korean_subset).
+
+     => Instead of iterating over everything on every call, only the portion limited to MAX_SAMPLES per split is searched (sampling).
    """
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)
    most_similar = None
    highest_similarity = -1

+     # Health dataset
+     for split in health_subset.keys():
+         for item in health_subset[split]:
            # Example: assume the health data has an (Input, Output) structure
            if 'Input' in item and 'Output' in item:
                item_text = f"[Health Info]\nInput: {item['Input']} | Output: {item['Output']}"
                item_embedding = embedding_model.encode(item_text, convert_to_tensor=True)
                similarity = util.pytorch_cos_sim(query_embedding, item_embedding).item()
                if similarity > highest_similarity:
                    highest_similarity = similarity
                    most_similar = item_text

+     # Recipe dataset
+     for split in recipe_subset.keys():
+         for item in recipe_subset[split]:
+             # Adjust the actual fields to match the dataset structure
            text_components = []
            if 'recipe_name' in item:
                text_components.append(f"Recipe Name: {item['recipe_name']}")

                highest_similarity = similarity
                most_similar = item_text

+     # Korean food dataset
+     for split in korean_subset.keys():
+         for item in korean_subset[split]:
+             # Example: assume name, description, and recipe fields
            text_components = []
            if 'name' in item:
                text_components.append(f"Name: {item['name']}")
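For reference, a minimal usage sketch of the retrieval helper extended by this commit (the query string is only an example):

# Illustrative call -- returns the best-matching item text across the three subsets, or None.
context = find_most_similar_data("low-sugar Korean meal plan for a diabetic patient")
if context:
    print(context)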
    most_similar_data = find_most_similar_data(user_message)

    # Set up the system message and prompt
    system_message = (
        "I present innovative recipes for new flavors and health, "
        "combining diverse recipe data, including Korean cuisine, with health knowledge, "

    [Data Reference]
    """

    if most_similar_data:
        prefixed_message = f"{system_prefix} {system_message}\n\n[Related Data]\n{most_similar_data}\n\nUser question: {user_message}"
    else:
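Purely as an illustration of how that prompt string is assembled (all values below are hypothetical placeholders, not the real system_prefix or system_message from app.py):

# Illustrative prompt assembly with placeholder values.
system_prefix = "You are MICHELIN Genesis."          # placeholder
system_message = "I present innovative recipes..."   # placeholder
most_similar_data = "[Health Info]\nInput: ... | Output: ..."
user_message = "What can I use instead of chocolate?"
prefixed_message = f"{system_prefix} {system_message}\n\n[Related Data]\n{most_similar_data}\n\nUser question: {user_message}"
print(prefixed_message)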
                        role="assistant",
                        content=response_buffer
                    )
                else:
                    # Streaming the "Thinking" output
                    thought_buffer += current_chunk

            )
            clear_button = gr.Button("Clear Conversation", scale=1)

+         # Example questions
        example_prompts = [
            ["Please create a new, creative pasta recipe, and reason through how you draw out the harmony of flavors in the process."],
            ["I want to make a special vegan dessert. What could I use as a chocolate substitute?"],

            )
            custom_clear_button = gr.Button("Clear Conversation", scale=1)

        custom_example_prompts = [
            ["Please put together a low-sugar Korean meal plan for a diabetic patient. I would like the menu for each meal and the nutritional information of the ingredients."],
            ["I want to develop a food recipe that is good for a specific condition (e.g., gastric ulcer). Please explain your suggestion and its scientific basis."],