immunobiotech committed on
Commit
1647f44
·
verified ·
1 Parent(s): 3ffd4e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -24
app.py CHANGED
@@ -7,7 +7,7 @@ import time
7
  from datasets import load_dataset
8
  from sentence_transformers import SentenceTransformer, util
9
 
10
- # 미쉐린 제네시스 API 키(기존 GEMINI_API_KEY 사용, 필요 시 환경 변수명 수정 가능)
11
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
12
  genai.configure(api_key=GEMINI_API_KEY)
13
 
@@ -18,18 +18,45 @@ model = genai.GenerativeModel("gemini-2.0-flash-thinking-exp-1219")
18
  # 데이터셋 불러오기
19
  ########################
20
 
21
- # ๊ฑด๊ฐ• ๊ด€๋ จ ์ง€์‹ ๊ทธ๋ž˜ํ”„(๊ธฐ์กด PharmKG๋ฅผ ํ™œ์šฉํ•˜์—ฌ ๊ฑด๊ฐ• ๋ถ„์„์„ ์œ„ํ•œ ๋ฐ์ดํ„ฐ์…‹ ์˜ˆ์‹œ)
22
  health_dataset = load_dataset("vinven7/PharmKG")
23
-
24
  # 레시피 데이터셋
25
  recipe_dataset = load_dataset("AkashPS11/recipes_data_food.com")
26
-
27
  # 한국 음식 정보 데이터셋
28
  korean_food_dataset = load_dataset("SGTCho/korean_food")
29
 
30
  # 문장 임베딩 모델 로드
31
  embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  def format_chat_history(messages: list) -> list:
35
  """
@@ -48,31 +75,34 @@ def format_chat_history(messages: list) -> list:
48
 
49
  def find_most_similar_data(query: str):
50
  """
51
- ์ž…๋ ฅ ์ฟผ๋ฆฌ์— ๊ฐ€์žฅ ์œ ์‚ฌํ•œ ๋ฐ์ดํ„ฐ๋ฅผ ์„ธ ๊ฐ€์ง€ ๋ฐ์ดํ„ฐ์…‹(๊ฑด๊ฐ•, ๋ ˆ์‹œํ”ผ, ํ•œ๊ตญ ์Œ์‹)์—์„œ ๊ฒ€์ƒ‰
 
 
 
 
 
 
52
  """
53
  query_embedding = embedding_model.encode(query, convert_to_tensor=True)
54
  most_similar = None
55
  highest_similarity = -1
56
 
57
- # ๊ฑด๊ฐ• ๋ฐ์ดํ„ฐ์…‹(์˜› PharmKG) ๊ฒ€์ƒ‰
58
- for split in health_dataset.keys():
59
- for item in health_dataset[split]:
60
  # ์˜ˆ: ๊ฑด๊ฐ• ๋ฐ์ดํ„ฐ์˜ ๊ตฌ์กฐ (Input, Output)๊ฐ€ ์žˆ๋‹ค๊ณ  ๊ฐ€์ •
61
  if 'Input' in item and 'Output' in item:
62
  item_text = f"[๊ฑด๊ฐ• ์ •๋ณด]\nInput: {item['Input']} | Output: {item['Output']}"
63
  item_embedding = embedding_model.encode(item_text, convert_to_tensor=True)
64
  similarity = util.pytorch_cos_sim(query_embedding, item_embedding).item()
65
-
66
  if similarity > highest_similarity:
67
  highest_similarity = similarity
68
  most_similar = item_text
69
 
70
- # ๋ ˆ์‹œํ”ผ ๋ฐ์ดํ„ฐ์…‹ ๊ฒ€์ƒ‰
71
- for split in recipe_dataset.keys():
72
- for item in recipe_dataset[split]:
73
- # ์‹ค์ œ ํ•„๋“œ๋Š” dataset ๊ตฌ์กฐ๋ฅผ ํ™•์ธ ํ›„ ์ ์ ˆํžˆ ์ˆ˜์ •ํ•ด์•ผ ํ•จ (์˜ˆ: title, steps, ingredients ๋“ฑ)
74
- # ์—ฌ๊ธฐ์„œ๋Š” ๊ฐ„๋‹จํžˆ ์˜ˆ์‹œ๋กœ 'recipe_name', 'ingredients', 'instructions' ๋“ฑ์˜ ํ•„๋“œ๋ฅผ ๊ฐ€์ •
75
- # ์‹ค์ œ ๋ฐ์ดํ„ฐ์…‹์—๋Š” ๋‹ค๋ฅธ ํ•„๋“œ๋ช…์ผ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ ํ•„์š” ์‹œ ์ˆ˜์ •
76
  text_components = []
77
  if 'recipe_name' in item:
78
  text_components.append(f"Recipe Name: {item['recipe_name']}")
@@ -90,10 +120,10 @@ def find_most_similar_data(query: str):
90
  highest_similarity = similarity
91
  most_similar = item_text
92
 
93
- # ํ•œ๊ตญ ์Œ์‹ ์ •๋ณด ๋ฐ์ดํ„ฐ์…‹ ๊ฒ€์ƒ‰
94
- for split in korean_food_dataset.keys():
95
- for item in korean_food_dataset[split]:
96
- # ์˜ˆ์‹œ: ํ•œ๊ตญ ์Œ์‹ ๋ฐ์ดํ„ฐ์—๋„ name, description, ingredients, recipe ๋“ฑ์ด ์žˆ์„ ๊ฒƒ์œผ๋กœ ์ถ”์ •
97
  text_components = []
98
  if 'name' in item:
99
  text_components.append(f"Name: {item['name']}")
@@ -134,7 +164,6 @@ def stream_gemini_response(user_message: str, messages: list) -> Iterator[list]:
134
  most_similar_data = find_most_similar_data(user_message)
135
 
136
  # ์‹œ์Šคํ…œ ๋ฉ”์‹œ์ง€์™€ ํ”„๋กฌํ”„ํŠธ ์„ค์ •
137
- # "MICHELIN Genesis"๋Š” ๊ฑด๊ฐ• ๋ถ„์„๊ณผ ๋ ˆ์‹œํ”ผ, ๋ง›์˜ ์ฐฝ์˜์ ์ธ ๊ฐ€์ด๋“œ๋ฅผ ์ œ๊ณตํ•˜๋Š” AI๋กœ ์„ค์ •
138
  system_message = (
139
  "์ €๋Š” ์ƒˆ๋กœ์šด ๋ง›๊ณผ ๊ฑด๊ฐ•์„ ์œ„ํ•œ ํ˜์‹ ์  ์กฐ๋ฆฌ๋ฒ•์„ ์ œ์‹œํ•˜๊ณ , "
140
  "ํ•œ๊ตญ ์Œ์‹์„ ๋น„๋กฏํ•œ ๋‹ค์–‘ํ•œ ๋ ˆ์‹œํ”ผ ๋ฐ์ดํ„ฐ์™€ ๊ฑด๊ฐ• ์ง€์‹์„ ๊ฒฐํ•ฉํ•˜์—ฌ "
@@ -158,7 +187,6 @@ def stream_gemini_response(user_message: str, messages: list) -> Iterator[list]:
158
  [๋ฐ์ดํ„ฐ ์ฐธ๊ณ ]
159
  """
160
 
161
- # ๊ด€๋ จ ๋ฐ์ดํ„ฐ๊ฐ€ ์žˆ์œผ๋ฉด ํ•จ๊ป˜ ์ „๋‹ฌ
162
  if most_similar_data:
163
  prefixed_message = f"{system_prefix} {system_message}\n\n[๊ด€๋ จ ๋ฐ์ดํ„ฐ]\n{most_similar_data}\n\n์‚ฌ์šฉ์ž ์งˆ๋ฌธ: {user_message}"
164
  else:
@@ -220,7 +248,6 @@ def stream_gemini_response(user_message: str, messages: list) -> Iterator[list]:
220
  role="assistant",
221
  content=response_buffer
222
  )
223
-
224
  else:
225
  # ์ƒ๊ฐ(Thinking) ์ŠคํŠธ๋ฆฌ๋ฐ
226
  thought_buffer += current_chunk
@@ -412,7 +439,7 @@ with gr.Blocks(
412
  )
413
  clear_button = gr.Button("๋Œ€ํ™” ์ดˆ๊ธฐํ™”", scale=1)
414
 
415
- # ์˜ˆ์‹œ ์งˆ๋ฌธ๋“ค ์ˆ˜์ •
416
  example_prompts = [
417
  ["์ƒˆ๋กœ์šด ์ฐฝ์˜์ ์ธ ํŒŒ์Šคํƒ€ ๋ ˆ์‹œํ”ผ๋ฅผ ๋งŒ๋“ค์–ด์ฃผ์„ธ์š”. ๊ทธ๋ฆฌ๊ณ  ๊ทธ ๊ณผ์ •์—์„œ ์–ด๋–ป๊ฒŒ ๋ง›์˜ ์กฐํ™”๋ฅผ ์ด๋Œ์–ด๋‚ด๋Š”์ง€ ์ถ”๋ก ํ•ด ์ฃผ์„ธ์š”."],
418
  ["๋น„๊ฑด์šฉ ํŠน๋ณ„ํ•œ ๋””์ €ํŠธ๋ฅผ ๋งŒ๋“ค๊ณ  ์‹ถ์–ด์š”. ์ดˆ์ฝœ๋ฆฟ ๋Œ€์ฒด์žฌ๋กœ ๋ฌด์—‡์„ ์“ธ ์ˆ˜ ์žˆ์„๊นŒ์š”?"],
@@ -472,7 +499,6 @@ with gr.Blocks(
472
  )
473
  custom_clear_button = gr.Button("๋Œ€ํ™” ์ดˆ๊ธฐํ™”", scale=1)
474
 
475
- # ์˜ˆ์‹œ
476
  custom_example_prompts = [
477
  ["๋‹น๋‡จ ํ™˜์ž๋ฅผ ์œ„ํ•œ ์ €๋‹น์งˆ ํ•œ์‹ ์‹๋‹จ ๊ณ„ํš์„ ์„ธ์›Œ์ฃผ์„ธ์š”. ๋ผ๋‹ˆ๋ณ„ ๋ฉ”๋‰ด์™€ ์žฌ๋ฃŒ์˜ ์˜์–‘์ •๋ณด๊ฐ€ ๊ถ๊ธˆํ•ฉ๋‹ˆ๋‹ค."],
478
  ["ํŠน์ • ์งˆํ™˜(์˜ˆ: ์œ„๊ถค์–‘)์— ์ข‹์€ ์–‘์‹ ๋ ˆ์‹œํ”ผ๋ฅผ ๊ฐœ๋ฐœํ•˜๊ณ  ์‹ถ์Šต๋‹ˆ๋‹ค. ์ œ์•ˆ๊ณผ ๊ณผํ•™์  ๊ทผ๊ฑฐ๋ฅผ ์„ค๋ช…ํ•ด์ฃผ์„ธ์š”."],
 
7
  from datasets import load_dataset
8
  from sentence_transformers import SentenceTransformer, util
9
 
10
+ # 미쉐린 제네시스 API 키
11
  GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
12
  genai.configure(api_key=GEMINI_API_KEY)
13
 
 
18
  # 데이터셋 불러오기
19
  ########################
20
 
21
+ # 건강 정보(기존 PharmKG 대체)를 위한 데이터셋
22
  health_dataset = load_dataset("vinven7/PharmKG")
 
23
  # 레시피 데이터셋
24
  recipe_dataset = load_dataset("AkashPS11/recipes_data_food.com")
 
25
  # 한국 음식 정보 데이터셋
26
  korean_food_dataset = load_dataset("SGTCho/korean_food")
27
 
28
  # 문장 임베딩 모델 로드
29
  embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
30
 
31
+ ########################
32
+ # (추가) 부분 샘플링
33
+ ########################
34
+
35
+ # health_dataset, recipe_dataset, korean_food_dataset์—์„œ ๋„ˆ๋ฌด ๋งŽ์€ ๋ฐ์ดํ„ฐ ์ „๋ถ€๋ฅผ ์ˆœํšŒํ•˜๋ฉด
36
+ # ๋งค ์ฟผ๋ฆฌ ์‹œ ์‹œ๊ฐ„์ด ์˜ค๋ž˜ ๊ฑธ๋ฆด ์ˆ˜ ์žˆ์Œ. ํ…Œ์ŠคํŠธ๋ฅผ ์œ„ํ•ด ๊ฐ split์—์„œ ์ตœ๋Œ€ 100๊ฐœ๋งŒ ์ถ”์ถœ:
37
+ MAX_SAMPLES = 100
38
+
39
+ # 건강 데이터셋 부분 샘플
40
+ health_subset = {}
41
+ for split in health_dataset.keys():
42
+ ds_split = health_dataset[split]
43
+ sub_len = min(MAX_SAMPLES, len(ds_split))
44
+ health_subset[split] = ds_split.select(range(sub_len))
45
+
46
+ # 레시피 데이터셋 부분 샘플
47
+ recipe_subset = {}
48
+ for split in recipe_dataset.keys():
49
+ ds_split = recipe_dataset[split]
50
+ sub_len = min(MAX_SAMPLES, len(ds_split))
51
+ recipe_subset[split] = ds_split.select(range(sub_len))
52
+
53
+ # 한국 음식 데이터셋 부분 샘플
54
+ korean_subset = {}
55
+ for split in korean_food_dataset.keys():
56
+ ds_split = korean_food_dataset[split]
57
+ sub_len = min(MAX_SAMPLES, len(ds_split))
58
+ korean_subset[split] = ds_split.select(range(sub_len))
59
+
60
 
61
  def format_chat_history(messages: list) -> list:
62
  """
 
75
 
76
  def find_most_similar_data(query: str):
77
  """
78
+ ์ž…๋ ฅ ์ฟผ๋ฆฌ์— ๊ฐ€์žฅ ์œ ์‚ฌํ•œ ๋ฐ์ดํ„ฐ๋ฅผ
79
+ 1) ๊ฑด๊ฐ• ๋ฐ์ดํ„ฐ์…‹ (health_subset)
80
+ 2) ๋ ˆ์‹œํ”ผ ๋ฐ์ดํ„ฐ์…‹ (recipe_subset)
81
+ 3) ํ•œ๊ตญ ์Œ์‹ ๋ฐ์ดํ„ฐ์…‹ (korean_subset)
82
+ ์—์„œ ๊ฒ€์ƒ‰.
83
+
84
+ => ๋งค๋ฒˆ ์ „์ฒด๋ฅผ ์ˆœํšŒํ•˜์ง€ ์•Š๊ณ , ๊ฐ split์—์„œ MAX_SAMPLES๋งŒ ์„ ํƒ๋œ ๋ถ€๋ถ„๋งŒ ๊ฒ€์ƒ‰ (์ƒ˜ํ”Œ๋ง)
85
  """
86
  query_embedding = embedding_model.encode(query, convert_to_tensor=True)
87
  most_similar = None
88
  highest_similarity = -1
89
 
90
+ # 건강 데이터셋
91
+ for split in health_subset.keys():
92
+ for item in health_subset[split]:
93
  # ์˜ˆ: ๊ฑด๊ฐ• ๋ฐ์ดํ„ฐ์˜ ๊ตฌ์กฐ (Input, Output)๊ฐ€ ์žˆ๋‹ค๊ณ  ๊ฐ€์ •
94
  if 'Input' in item and 'Output' in item:
95
  item_text = f"[๊ฑด๊ฐ• ์ •๋ณด]\nInput: {item['Input']} | Output: {item['Output']}"
96
  item_embedding = embedding_model.encode(item_text, convert_to_tensor=True)
97
  similarity = util.pytorch_cos_sim(query_embedding, item_embedding).item()
 
98
  if similarity > highest_similarity:
99
  highest_similarity = similarity
100
  most_similar = item_text
101
 
102
+ # 레시피 데이터셋
103
+ for split in recipe_subset.keys():
104
+ for item in recipe_subset[split]:
105
+ # ์‹ค์ œ ํ•„๋“œ๋Š” dataset ๊ตฌ์กฐ์— ๋งž์ถฐ ์กฐ์ •
 
 
106
  text_components = []
107
  if 'recipe_name' in item:
108
  text_components.append(f"Recipe Name: {item['recipe_name']}")
 
120
  highest_similarity = similarity
121
  most_similar = item_text
122
 
123
+ # 한국 음식 데이터셋
124
+ for split in korean_subset.keys():
125
+ for item in korean_subset[split]:
126
+ # ์˜ˆ: name, description, recipe ํ•„๋“œ ๊ฐ€์ •
127
  text_components = []
128
  if 'name' in item:
129
  text_components.append(f"Name: {item['name']}")
 
164
  most_similar_data = find_most_similar_data(user_message)
165
 
166
  # ์‹œ์Šคํ…œ ๋ฉ”์‹œ์ง€์™€ ํ”„๋กฌํ”„ํŠธ ์„ค์ •
 
167
  system_message = (
168
  "์ €๋Š” ์ƒˆ๋กœ์šด ๋ง›๊ณผ ๊ฑด๊ฐ•์„ ์œ„ํ•œ ํ˜์‹ ์  ์กฐ๋ฆฌ๋ฒ•์„ ์ œ์‹œํ•˜๊ณ , "
169
  "ํ•œ๊ตญ ์Œ์‹์„ ๋น„๋กฏํ•œ ๋‹ค์–‘ํ•œ ๋ ˆ์‹œํ”ผ ๋ฐ์ดํ„ฐ์™€ ๊ฑด๊ฐ• ์ง€์‹์„ ๊ฒฐํ•ฉํ•˜์—ฌ "
 
187
  [๋ฐ์ดํ„ฐ ์ฐธ๊ณ ]
188
  """
189
 
 
190
  if most_similar_data:
191
  prefixed_message = f"{system_prefix} {system_message}\n\n[๊ด€๋ จ ๋ฐ์ดํ„ฐ]\n{most_similar_data}\n\n์‚ฌ์šฉ์ž ์งˆ๋ฌธ: {user_message}"
192
  else:
 
248
  role="assistant",
249
  content=response_buffer
250
  )
 
251
  else:
252
  # ์ƒ๊ฐ(Thinking) ์ŠคํŠธ๋ฆฌ๋ฐ
253
  thought_buffer += current_chunk
 
439
  )
440
  clear_button = gr.Button("๋Œ€ํ™” ์ดˆ๊ธฐํ™”", scale=1)
441
 
442
+ # ์˜ˆ์‹œ ์งˆ๋ฌธ๋“ค
443
  example_prompts = [
444
  ["์ƒˆ๋กœ์šด ์ฐฝ์˜์ ์ธ ํŒŒ์Šคํƒ€ ๋ ˆ์‹œํ”ผ๋ฅผ ๋งŒ๋“ค์–ด์ฃผ์„ธ์š”. ๊ทธ๋ฆฌ๊ณ  ๊ทธ ๊ณผ์ •์—์„œ ์–ด๋–ป๊ฒŒ ๋ง›์˜ ์กฐํ™”๋ฅผ ์ด๋Œ์–ด๋‚ด๋Š”์ง€ ์ถ”๋ก ํ•ด ์ฃผ์„ธ์š”."],
445
  ["๋น„๊ฑด์šฉ ํŠน๋ณ„ํ•œ ๋””์ €ํŠธ๋ฅผ ๋งŒ๋“ค๊ณ  ์‹ถ์–ด์š”. ์ดˆ์ฝœ๋ฆฟ ๋Œ€์ฒด์žฌ๋กœ ๋ฌด์—‡์„ ์“ธ ์ˆ˜ ์žˆ์„๊นŒ์š”?"],
 
499
  )
500
  custom_clear_button = gr.Button("๋Œ€ํ™” ์ดˆ๊ธฐํ™”", scale=1)
501
 
 
502
  custom_example_prompts = [
503
  ["๋‹น๋‡จ ํ™˜์ž๋ฅผ ์œ„ํ•œ ์ €๋‹น์งˆ ํ•œ์‹ ์‹๋‹จ ๊ณ„ํš์„ ์„ธ์›Œ์ฃผ์„ธ์š”. ๋ผ๋‹ˆ๋ณ„ ๋ฉ”๋‰ด์™€ ์žฌ๋ฃŒ์˜ ์˜์–‘์ •๋ณด๊ฐ€ ๊ถ๊ธˆํ•ฉ๋‹ˆ๋‹ค."],
504
  ["ํŠน์ • ์งˆํ™˜(์˜ˆ: ์œ„๊ถค์–‘)์— ์ข‹์€ ์–‘์‹ ๋ ˆ์‹œํ”ผ๋ฅผ ๊ฐœ๋ฐœํ•˜๊ณ  ์‹ถ์Šต๋‹ˆ๋‹ค. ์ œ์•ˆ๊ณผ ๊ณผํ•™์  ๊ทผ๊ฑฐ๋ฅผ ์„ค๋ช…ํ•ด์ฃผ์„ธ์š”."],