seawolf2357 committed on
Commit
05dc4f5
·
verified ·
1 Parent(s): 013e118

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -64
app.py CHANGED
@@ -5,8 +5,8 @@ import re
5
  import tempfile
6
  from collections.abc import Iterator
7
  from threading import Thread
8
- import json # โ† JSON ๋ณ€ํ™˜์„ ์œ„ํ•ด ์ถ”๊ฐ€
9
- import requests # SERPHouse web search
10
  import cv2
11
  import gradio as gr
12
  import spaces
@@ -17,26 +17,18 @@ from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIter
17
 
18
  # CSV/TXT ๋ถ„์„
19
  import pandas as pd
20
-
21
  # PDF ํ…์ŠคํŠธ ์ถ”์ถœ
22
  import PyPDF2
23
 
24
  ##############################################################################
25
- # SERPHouse API key from environment variable (์‚ฌ์šฉ์ž๊ฐ€ ํ™˜๊ฒฝ๋ณ€์ˆ˜๋กœ ์ง€์ •ํ•ด์•ผ ํ•จ)
26
  ##############################################################################
27
  SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
28
 
29
  ##############################################################################
30
- # ๊ฐ„๋‹จํ•œ ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜ (์‚ฌ์šฉ์ž ํ”„๋กฌํ”„ํŠธ -> ํ‚ค์›Œ๋“œ)
31
  ##############################################################################
32
  def extract_keywords(text: str, top_k: int = 5) -> str:
33
- """
34
- ๊ฐ€์žฅ ๊ฐ„๋‹จํ•œ ์˜ˆ์‹œ:
35
- 1) ํ…์ŠคํŠธ๋ฅผ ์†Œ๋ฌธ์ž๋กœ
36
- 2) ์•ŒํŒŒ๋ฒณ/์ˆซ์ž/๊ณต๋ฐฑ ์ œ์™ธ ๋ฌธ์ž ์ œ๊ฑฐ
37
- 3) ๊ณต๋ฐฑ ํ† ํฐ ๋ถ„๋ฆฌ
38
- 4) ์•ž ํ† ํฐ n๊ฐœ ์ถ”์ถœ
39
- """
40
  text = text.lower()
41
  text = re.sub(r"[^a-z0-9\s]", "", text)
42
  tokens = text.split()
@@ -45,11 +37,12 @@ def extract_keywords(text: str, top_k: int = 5) -> str:
45
 
46
  ##############################################################################
47
  # SERPHouse Live endpoint ํ˜ธ์ถœ
48
- # - ์ƒ์œ„ 20๊ฐœ ๊ฒฐ๊ณผ ๋ชจ๋‘ "์ „์ฒด item"์„ system msg์— ๋‹ด์•„(=JSON ๊ทธ๋Œ€๋กœ) LLM์ด ์ฐธ์กฐ
49
  ##############################################################################
50
  def do_web_search(query: str) -> str:
51
  """
52
- SERPHouse ๋ผ์ด๋ธŒ ๊ฒ€์ƒ‰ ํ˜ธ์ถœ, ์ƒ์œ„ 20๊ฐœ 'organic' ๊ฒฐ๊ณผ ์ „์ฒด๋ฅผ JSON ํ˜•ํƒœ๋กœ ๋ฌถ์–ด์„œ ๋ฐ˜ํ™˜.
 
53
  """
54
  try:
55
  url = "https://api.serphouse.com/serp/live"
@@ -59,11 +52,11 @@ def do_web_search(query: str) -> str:
59
  "lang": "en",
60
  "device": "desktop",
61
  "serp_type": "web",
62
- "num_result": "20", # ์ƒ์œ„ 20๊ฐœ ๊ฒฐ๊ณผ
63
  "api_token": SERPHOUSE_API_KEY,
64
  }
65
  resp = requests.get(url, params=params, timeout=30)
66
- resp.raise_for_status() # 4xx/5xx ์—๋Ÿฌ ์‹œ ์˜ˆ์™ธ
67
  data = resp.json()
68
 
69
  results = data.get("results", {})
@@ -71,10 +64,9 @@ def do_web_search(query: str) -> str:
71
  if not organic:
72
  return "No web search results found."
73
 
74
- # ๊ฐ item์„ JSON(์ „์ฒด ํ•„๋“œ)์œผ๋กœ ๋ณ€ํ™˜ํ•˜์—ฌ ์ €์žฅ
75
  summary_lines = []
76
  for idx, item in enumerate(organic[:20], start=1):
77
- # item ์ „์ฒด๋ฅผ JSON ๋ฌธ์ž์—ด๋กœ ๋ณ€ํ™˜
78
  item_json = json.dumps(item, ensure_ascii=False, indent=2)
79
  summary_lines.append(f"Result {idx}:\n{item_json}\n")
80
 
@@ -85,9 +77,9 @@ def do_web_search(query: str) -> str:
85
 
86
 
87
  ##############################################################################
88
- # ์ƒ์ˆ˜ ์„ค์ •
89
  ##############################################################################
90
- MAX_CONTENT_CHARS = 4000 # ๋„ˆ๋ฌด ํฐ ํŒŒ์ผ์„ ๋ง‰๊ธฐ ์œ„ํ•ด ์ตœ๋Œ€ 4000์ž๋งŒ ํ‘œ์‹œ
91
  model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
92
  processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
93
  model = Gemma3ForConditionalGeneration.from_pretrained(
@@ -108,7 +100,6 @@ def analyze_csv_file(path: str) -> str:
108
  """
109
  try:
110
  df = pd.read_csv(path)
111
- # ์ตœ๋Œ€ 50ํ–‰, 10์—ด๊นŒ์ง€๋งŒ ํ‘œ์‹œ
112
  if df.shape[0] > 50 or df.shape[1] > 10:
113
  df = df.iloc[:50, :10]
114
  df_str = df.to_string()
@@ -193,13 +184,6 @@ def count_files_in_history(history: list[dict]) -> tuple[int, int]:
193
 
194
 
195
  def validate_media_constraints(message: dict, history: list[dict]) -> bool:
196
- """
197
- - ๋น„๋””์˜ค 1๊ฐœ ์ดˆ๊ณผ ๋ถˆ๊ฐ€
198
- - ๋น„๋””์˜ค์™€ ์ด๋ฏธ์ง€ ํ˜ผํ•ฉ ๋ถˆ๊ฐ€
199
- - ์ด๋ฏธ์ง€ ๊ฐœ์ˆ˜(MAX_NUM_IMAGES) ์ดˆ๊ณผ ๋ถˆ๊ฐ€
200
- - <image> ํƒœ๊ทธ๊ฐ€ ์žˆ์œผ๋ฉด ํƒœ๊ทธ ์ˆ˜์™€ ์‹ค์ œ ์ด๋ฏธ์ง€ ์ˆ˜ ์ผ์น˜
201
- - CSV, TXT, PDF ๋“ฑ์€ ์—ฌ๊ธฐ์„œ ์ œํ•œํ•˜์ง€ ์•Š์Œ
202
- """
203
  media_files = []
204
  for f in message["files"]:
205
  if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE) or f.endswith(".mp4"):
@@ -381,7 +365,7 @@ def process_history(history: list[dict]) -> list[dict]:
381
 
382
 
383
  ##############################################################################
384
- # ๋ฉ”์ธ ์ถ”๋ก  ํ•จ์ˆ˜ (web search ์ฒดํฌ ์‹œ ์ž๋™ ํ‚ค์›Œ๋“œ์ถ”์ถœ->๊ฒ€์ƒ‰->๊ฒฐ๊ณผ system msg ๋ฐ˜์˜)
385
  ##############################################################################
386
  @spaces.GPU(duration=120)
387
  def run(
@@ -398,47 +382,47 @@ def run(
398
  return
399
 
400
  try:
401
- # web_search๊ฐ€ True๋ฉด => message["text"]๋ฅผ ๊ธฐ๋ฐ˜์œผ๋กœ ํ‚ค์›Œ๋“œ ์ถ”์ถœํ•˜์—ฌ SERPHouse ํ˜ธ์ถœ
402
- history_system_msg = None
 
 
 
 
 
 
403
  if use_web_search:
404
  user_text = message["text"]
405
- # 1) ํ‚ค์›Œ๋“œ ์ถ”์ถœ
406
  ws_query = extract_keywords(user_text, top_k=5)
407
- logger.info(f"[Auto WebSearch Keyword] {ws_query!r}")
408
- # 2) ์ƒ์œ„ 20๊ฐœ ๊ฒฐ๊ณผ (item ์ „์ฒด) ๊ฐ€์ ธ์˜ค๊ธฐ
409
- ws_result = do_web_search(ws_query)
410
- # 3) ์ด๋ฅผ system ๋ฉ”์‹œ์ง€๋กœ ์ถ”๊ฐ€
411
- system_search_content = f"[Search top-20 Full Items Based on user prompt]\n{ws_result}\n"
412
- if system_search_content.strip():
413
- history_system_msg = {
414
- "role": "system",
415
- "content": [{"type": "text", "text": system_search_content}]
416
- }
417
  else:
418
- history_system_msg = {
419
- "role": "system",
420
- "content": [{"type": "text", "text": "No web search results"}]
421
- }
422
 
423
- # ๊ธฐ์กด system prompt
424
  messages = []
425
- if system_prompt:
426
- messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
427
- # web ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ system msg
428
- if history_system_msg:
429
- messages.append(history_system_msg)
430
-
431
- # ์ด์ „ ๋Œ€ํ™”์ด๋ ฅ(assistant/user)
 
432
  messages.extend(process_history(history))
433
-
434
- # ์ƒˆ ์œ ์ € ๋ฉ”์‹œ์ง€ ๋ณ€ํ™˜
435
  user_content = process_new_user_message(message)
436
  for item in user_content:
437
  if item["type"] == "text" and len(item["text"]) > MAX_CONTENT_CHARS:
438
  item["text"] = item["text"][:MAX_CONTENT_CHARS] + "\n...(truncated)..."
439
  messages.append({"role": "user", "content": user_content})
440
 
441
- # LLM ์ž…๋ ฅ ์ƒ์„ฑ
442
  inputs = processor.apply_chat_template(
443
  messages,
444
  add_generation_prompt=True,
@@ -453,7 +437,7 @@ def run(
453
  streamer=streamer,
454
  max_new_tokens=max_new_tokens,
455
  )
456
-
457
  t = Thread(target=model.generate, kwargs=gen_kwargs)
458
  t.start()
459
 
@@ -632,7 +616,7 @@ title_html = """
632
  </p>
633
  """
634
 
635
- with gr.Blocks(css=css, title="Vidraft-G3-27B-Multi-Search") as demo:
636
  gr.Markdown(title_html)
637
 
638
  with gr.Row():
@@ -669,9 +653,9 @@ with gr.Blocks(css=css, title="Vidraft-G3-27B-Multi-Search") as demo:
669
  value=2000,
670
  )
671
 
672
- gr.Markdown("<br><br>") # spacing
673
 
674
- # Main ChatInterface to the right
675
  with gr.Column(scale=7):
676
  chat = gr.ChatInterface(
677
  fn=run,
@@ -690,7 +674,7 @@ with gr.Blocks(css=css, title="Vidraft-G3-27B-Multi-Search") as demo:
690
  system_prompt_box,
691
  max_tokens_slider,
692
  web_search_checkbox,
693
- web_search_text, # ์‹ค์ œ๋กœ๋Š” auto search
694
  ],
695
  stop_btn=False,
696
  title="Vidraft-Gemma-3-27B",
@@ -706,9 +690,11 @@ with gr.Blocks(css=css, title="Vidraft-G3-27B-Multi-Search") as demo:
706
  gr.Markdown("### Example Inputs (click to load)")
707
  gr.Examples(
708
  examples=examples,
709
- inputs=[], # ๋งํฌํ•  inputs๊ฐ€ ์—†์œผ๋ฏ€๋กœ ๋นˆ ๋ฆฌ์ŠคํŠธ
710
  cache_examples=False
711
  )
712
 
713
  if __name__ == "__main__":
714
- demo.launch()
 
 
 
5
  import tempfile
6
  from collections.abc import Iterator
7
  from threading import Thread
8
+ import json
9
+ import requests
10
  import cv2
11
  import gradio as gr
12
  import spaces
 
17
 
18
  # CSV/TXT ๋ถ„์„
19
  import pandas as pd
 
20
  # PDF ํ…์ŠคํŠธ ์ถ”์ถœ
21
  import PyPDF2
22
 
23
  ##############################################################################
24
+ # SERPHouse API key from environment variable
25
  ##############################################################################
26
  SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
27
 
28
  ##############################################################################
29
+ # ๊ฐ„๋‹จํ•œ ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜
30
  ##############################################################################
31
  def extract_keywords(text: str, top_k: int = 5) -> str:
 
 
 
 
 
 
 
32
  text = text.lower()
33
  text = re.sub(r"[^a-z0-9\s]", "", text)
34
  tokens = text.split()
 
37
 
38
  ##############################################################################
39
  # SERPHouse Live endpoint ํ˜ธ์ถœ
40
+ # - ์ƒ์œ„ 20๊ฐœ ๊ฒฐ๊ณผ JSON์„ LLM์— ๋„˜๊ธธ ๋•Œ link, snippet ๋“ฑ ๋ชจ๋‘ ํฌํ•จ
41
  ##############################################################################
42
  def do_web_search(query: str) -> str:
43
  """
44
+ ์ƒ์œ„ 20๊ฐœ 'organic' ๊ฒฐ๊ณผ item ์ „์ฒด(์ œ๋ชฉ, link, snippet ๋“ฑ)๋ฅผ
45
+ JSON ๋ฌธ์ž์—ด ํ˜•ํƒœ๋กœ ๋ฐ˜ํ™˜
46
  """
47
  try:
48
  url = "https://api.serphouse.com/serp/live"
 
52
  "lang": "en",
53
  "device": "desktop",
54
  "serp_type": "web",
55
+ "num_result": "20",
56
  "api_token": SERPHOUSE_API_KEY,
57
  }
58
  resp = requests.get(url, params=params, timeout=30)
59
+ resp.raise_for_status()
60
  data = resp.json()
61
 
62
  results = data.get("results", {})
 
64
  if not organic:
65
  return "No web search results found."
66
 
 
67
  summary_lines = []
68
  for idx, item in enumerate(organic[:20], start=1):
69
+ # item ์ „์ฒด๋ฅผ JSON ๋ฌธ์ž์—ด๋กœ
70
  item_json = json.dumps(item, ensure_ascii=False, indent=2)
71
  summary_lines.append(f"Result {idx}:\n{item_json}\n")
72
 
 
77
 
78
 
79
  ##############################################################################
80
+ # ๋ชจ๋ธ/ํ”„๋กœ์„ธ์„œ ๋กœ๋”ฉ
81
  ##############################################################################
82
+ MAX_CONTENT_CHARS = 4000
83
  model_id = os.getenv("MODEL_ID", "google/gemma-3-27b-it")
84
  processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
85
  model = Gemma3ForConditionalGeneration.from_pretrained(
 
100
  """
101
  try:
102
  df = pd.read_csv(path)
 
103
  if df.shape[0] > 50 or df.shape[1] > 10:
104
  df = df.iloc[:50, :10]
105
  df_str = df.to_string()
 
184
 
185
 
186
  def validate_media_constraints(message: dict, history: list[dict]) -> bool:
 
 
 
 
 
 
 
187
  media_files = []
188
  for f in message["files"]:
189
  if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE) or f.endswith(".mp4"):
 
365
 
366
 
367
  ##############################################################################
368
+ # ๋ฉ”์ธ ์ถ”๋ก  ํ•จ์ˆ˜ (web search ์ฒดํฌ ์‹œ ์ž๋™ ํ‚ค์›Œ๋“œ์ถ”์ถœ->๊ฒ€์ƒ‰->๊ฒฐ๊ณผ system msg)
369
  ##############################################################################
370
  @spaces.GPU(duration=120)
371
  def run(
 
382
  return
383
 
384
  try:
385
+ # (1) system ๋ฉ”์‹œ์ง€๋ฅผ ํ•˜๋‚˜๋กœ ํ•ฉ์น˜๊ธฐ ์œ„ํ•ด, ๋ฏธ๋ฆฌ buffer
386
+ combined_system_msg = ""
387
+
388
+ # ์‚ฌ์šฉ์ž๊ฐ€ system_prompt๋ฅผ ์ž…๋ ฅํ–ˆ๋‹ค๋ฉด
389
+ if system_prompt.strip():
390
+ combined_system_msg += f"[System Prompt]\n{system_prompt.strip()}\n\n"
391
+
392
+ # (2) ์›น ๊ฒ€์ƒ‰ ์ฒดํฌ ์‹œ, ํ‚ค์›Œ๋“œ ์ถ”์ถœ
393
  if use_web_search:
394
  user_text = message["text"]
 
395
  ws_query = extract_keywords(user_text, top_k=5)
396
+ # ๋งŒ์•ฝ ์ถ”์ถœ ํ‚ค์›Œ๋“œ๊ฐ€ ๋น„์–ด์žˆ์œผ๋ฉด ๊ฒ€์ƒ‰์„ ๊ฑด๋„ˆ๋œ€
397
+ if ws_query.strip():
398
+ logger.info(f"[Auto WebSearch Keyword] {ws_query!r}")
399
+ ws_result = do_web_search(ws_query)
400
+ # ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ์‹œ์Šคํ…œ ๋ฉ”์‹œ์ง€ ๋์— ํ•ฉ์นจ
401
+ combined_system_msg += f"[Search top-20 Full Items Based on user prompt]\n{ws_result}\n\n"
 
 
 
 
402
  else:
403
+ # ์ถ”์ถœ๋œ ํ‚ค์›Œ๋“œ๊ฐ€ ์—†์œผ๋ฉด ๊ตณ์ด ๊ฒ€์ƒ‰ ์‹œ๋„ ์•ˆ ํ•จ
404
+ combined_system_msg += "[No valid keywords found, skipping WebSearch]\n\n"
 
 
405
 
406
+ # (3) system ๋ฉ”์‹œ์ง€๊ฐ€ ์ตœ์ข…์ ์œผ๋กœ ๋น„์–ด ์žˆ์ง€ ์•Š๋‹ค๋ฉด
407
  messages = []
408
+ if combined_system_msg.strip():
409
+ # system ์—ญํ•  ๋ฉ”์‹œ์ง€ ํ•˜๋‚˜ ์ƒ์„ฑ
410
+ messages.append({
411
+ "role": "system",
412
+ "content": [{"type": "text", "text": combined_system_msg.strip()}],
413
+ })
414
+
415
+ # (4) ์ด์ „ ๋Œ€ํ™”์ด๋ ฅ
416
  messages.extend(process_history(history))
417
+
418
+ # (5) ์ƒˆ ์œ ์ € ๋ฉ”์‹œ์ง€
419
  user_content = process_new_user_message(message)
420
  for item in user_content:
421
  if item["type"] == "text" and len(item["text"]) > MAX_CONTENT_CHARS:
422
  item["text"] = item["text"][:MAX_CONTENT_CHARS] + "\n...(truncated)..."
423
  messages.append({"role": "user", "content": user_content})
424
 
425
+ # (6) LLM ์ž…๋ ฅ ์ƒ์„ฑ
426
  inputs = processor.apply_chat_template(
427
  messages,
428
  add_generation_prompt=True,
 
437
  streamer=streamer,
438
  max_new_tokens=max_new_tokens,
439
  )
440
+
441
  t = Thread(target=model.generate, kwargs=gen_kwargs)
442
  t.start()
443
 
 
616
  </p>
617
  """
618
 
619
+ with gr.Blocks(css=css, title="Vidraft-Gemma-3-27B") as demo:
620
  gr.Markdown(title_html)
621
 
622
  with gr.Row():
 
653
  value=2000,
654
  )
655
 
656
+ gr.Markdown("<br><br>")
657
 
658
+ # Main ChatInterface
659
  with gr.Column(scale=7):
660
  chat = gr.ChatInterface(
661
  fn=run,
 
674
  system_prompt_box,
675
  max_tokens_slider,
676
  web_search_checkbox,
677
+ web_search_text,
678
  ],
679
  stop_btn=False,
680
  title="Vidraft-Gemma-3-27B",
 
690
  gr.Markdown("### Example Inputs (click to load)")
691
  gr.Examples(
692
  examples=examples,
693
+ inputs=[], # ์—ฐ๊ฒฐํ•  inputs๊ฐ€ ์—†์œผ๋ฏ€๋กœ ๋นˆ ๋ฆฌ์ŠคํŠธ
694
  cache_examples=False
695
  )
696
 
697
  if __name__ == "__main__":
698
+ # share=True ํ•˜์‹œ๋ฉด public URL ์‚ฌ์šฉ ๊ฐ€๋Šฅ
699
+ demo.launch(share=True)
700
+