Jyothikamalesh committed
Commit 2efa6f5 · verified · 1 Parent(s): faf7727

Update app.py

Files changed (1):
  1. app.py +77 -48
app.py CHANGED
@@ -1,8 +1,8 @@
-#refer llama recipes for more info https://github.com/huggingface/huggingface-llama-recipes/blob/main/inference-api.ipynb
-#huggingface-llama-recipes : https://github.com/huggingface/huggingface-llama-recipes/tree/main
 import gradio as gr
-from openai import OpenAI
+from openai import OpenAI, APIError
 import os
+import tenacity
+import asyncio
 
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
 
@@ -11,59 +11,88 @@ client = OpenAI(
     api_key=ACCESS_TOKEN,
 )
 
-def respond(
+# Retry logic with tenacity for handling API rate limits
+@tenacity.retry(wait=tenacity.wait_exponential(multiplier=1, min=4, max=10), stop=tenacity.stop_after_attempt(5))
+async def respond(
     message,
-    history: list[tuple[str, str]],
     system_message,
     max_tokens,
     temperature,
     top_p,
 ):
-    messages = [{"role": "system", "content": system_message}]
-
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat.completions.create(
-        model="NousResearch/Hermes-3-Llama-3.1-8B",
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-        messages=messages,
-    ):
-        token = message.choices[0].delta.content
-
-        response += token
-        yield response
-
-chatbot = gr.Chatbot(height=600)
-
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-P",
-        ),
-
-    ],
-    fill_height=True,
-    chatbot=chatbot,
-    theme="Nymbo/Alyx_Theme",
-)
+    try:
+        # Only use the system message and the current message for the response
+        messages = [{"role": "system", "content": system_message},
+                    {"role": "user", "content": message}]
+
+        response = ""
+        # Properly stream chat completions using dot notation
+        stream = client.chat.completions.create(
+            model="NousResearch/Hermes-3-Llama-3.1-8B",
+            max_tokens=max_tokens,
+            stream=True,
+            temperature=temperature,
+            top_p=top_p,
+            messages=messages,
+        )
+
+        # Stream response and concatenate tokens
+        for chunk in stream:
+            if hasattr(chunk.choices[0].delta, 'content'):
+                token = chunk.choices[0].delta.content
+                response += token
+
+        return response
+
+    except APIError as e:
+        # Handle both string and dict types of error bodies
+        error_details = e.body
+        if isinstance(error_details, dict):
+            error_type = error_details.get("type", "Unknown")
+            error_code = error_details.get("code", "Unknown")
+            error_param = error_details.get("param", "Unknown")
+            error_message = error_details.get("message", "An error occurred.")
+            error_str = f"{error_type}: {error_message} (code: {error_code}, param: {error_param})"
+        else:
+            error_str = f"Error: {error_details}"
+
+        print(f"APIError: {error_str}")
+        return error_str
+
+    except Exception as e:
+        print(f"Exception: {e}")
+        return "Error occurred. Please try again."
+
+
+# Async Gradio function to handle user input and response generation without history
+async def generate_response(message, system_message, max_tokens, temperature, top_p):
+    response = await respond(message, system_message, max_tokens, temperature, top_p)
+    return response
+
+
+def launch_app():
+    try:
+        demo = gr.Blocks()
+        with demo:
+            gr.Markdown("# Chatbot")
+            message = gr.Textbox(label="Message")
+            system_message = gr.Textbox(label="System message")
+            max_tokens = gr.Slider(minimum=1, maximum=2048, value=2048, step=1, label="Max new tokens")
+            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
+            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
+            response = gr.Text(label="Response")
+
+            # Use the async version of generate_response without history
+            gr.Button("Generate Response").click(
+                generate_response,
+                inputs=[message, system_message, max_tokens, temperature, top_p],
+                outputs=[response],
+                show_progress=False,
+            )
+        demo.launch(show_error=True)
+    except KeyError as e:
+        print(f"Error: {e}")
+        print("Please try again.")
+
 if __name__ == "__main__":
-    demo.launch()
+    launch_app()
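
A note on the retry policy: as committed, the tenacity decorator is effectively dead code, because the try/except inside respond() catches every exception and returns a string, so no failure ever propagates for tenacity to retry. A minimal sketch of one way to make the policy bite, assuming openai>=1.0 and tenacity are installed (the complete() helper and the placeholder credentials are illustrative, not part of the commit): retry only rate-limit errors and let everything else fail fast.

import tenacity
from openai import OpenAI, RateLimitError

client = OpenAI(api_key="hf_...")  # hypothetical placeholder credentials

@tenacity.retry(
    retry=tenacity.retry_if_exception_type(RateLimitError),  # retry rate limits only
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
    stop=tenacity.stop_after_attempt(5),
    reraise=True,  # after the final attempt, surface the RateLimitError itself
)
def complete(messages):
    # No try/except here: the error must propagate for tenacity to see it
    return client.chat.completions.create(
        model="NousResearch/Hermes-3-Llama-3.1-8B",
        messages=messages,
        max_tokens=512,
    )

With this shape, bad requests and auth failures surface immediately instead of burning five attempts, while transient rate limits back off for 4 to 10 seconds between tries.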
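A pitfall carried over from the old version: with the v1 openai client, every streamed delta has a content attribute, but its value is None on the role-only first chunk and the final stop chunk, so the hasattr() guard above does not prevent response += token from raising a TypeError. Checking the value rather than the attribute is the safe pattern; a minimal sketch, where collect() is an illustrative helper taking the stream returned by client.chat.completions.create(..., stream=True):

def collect(stream):
    response = ""
    for chunk in stream:
        # delta.content always exists as an attribute but is often None,
        # so test the value, not the attribute, before concatenating
        if chunk.choices and chunk.choices[0].delta.content is not None:
            response += chunk.choices[0].delta.content
    return response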
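The rewrite also gives up live streaming: the old respond() yielded partial text into gr.ChatInterface, while the new one returns a single finished string. Gradio event handlers can also be generator functions, so the Blocks UI could keep token-by-token updates with a sketch like the following, reusing client from the commit and the None-safe check from the previous note (depending on the Gradio version, queueing may need to be enabled with demo.queue() for yielded updates to reach the browser):

def generate_response(message, system_message, max_tokens, temperature, top_p):
    messages = [{"role": "system", "content": system_message},
                {"role": "user", "content": message}]
    stream = client.chat.completions.create(
        model="NousResearch/Hermes-3-Llama-3.1-8B",
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        stream=True,
    )
    response = ""
    for chunk in stream:
        if chunk.choices and chunk.choices[0].delta.content is not None:
            response += chunk.choices[0].delta.content
            yield response  # each yield re-renders the Text output

Wired into .click() exactly as the async version is, this streams the growing answer into the response box instead of showing it all at once.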