phk0 committed
Commit eadaa23 · verified · 1 parent: b983700

copied from https://huggingface.co/spaces/prithivMLmods/Llama-3.1-70B/blob/main/app.py

Files changed (1): app.py +63 -80
app.py CHANGED
@@ -1,91 +1,74 @@
-from huggingface_hub import InferenceClient
+#refer llama recipes for more info https://github.com/huggingface/huggingface-llama-recipes/blob/main/inference-api.ipynb
+#huggingface-llama-recipes : https://github.com/huggingface/huggingface-llama-recipes/tree/main
 import gradio as gr
-
-llm="meta-llama/Meta-Llama-3.1-70B"
-
-
-client = InferenceClient(llm)
-
-
-def format_prompt(message, history):
-    prompt = "<s>"
-    for user_prompt, bot_response in history:
-        prompt += f"[INST] {user_prompt} [/INST]"
-        prompt += f" {bot_response}</s> "
-    prompt += f"[INST] {message} [/INST]"
-    return prompt
-
-def generate(
-    prompt, history, temperature=0.9, max_new_tokens=256, top_p=0.95, repetition_penalty=1.0,
-):
-    temperature = float(temperature)
-    if temperature < 1e-2:
-        temperature = 1e-2
-    top_p = float(top_p)
-
-    generate_kwargs = dict(
+from openai import OpenAI
+import os
+
+css = '''
+.gradio-container{max-width: 890px !important}
+h1{text-align:center}
+footer {
+    visibility: hidden
+}
+'''
+
+ACCESS_TOKEN = os.getenv("HF_TOKEN")
+
+client = OpenAI(
+    base_url="https://api-inference.huggingface.co/v1/",
+    api_key=ACCESS_TOKEN,
+)
+
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+):
+    messages = [{"role": "system", "content": system_message}]
+
+    for val in history:
+        if val[0]:
+            messages.append({"role": "user", "content": val[0]})
+        if val[1]:
+            messages.append({"role": "assistant", "content": val[1]})
+
+    messages.append({"role": "user", "content": message})
+
+    response = ""
+
+    for message in client.chat.completions.create(
+        model="meta-llama/Meta-Llama-3.1-70B-Instruct",
+        max_tokens=max_tokens,
+        stream=True,
         temperature=temperature,
-        max_new_tokens=max_new_tokens,
         top_p=top_p,
-        repetition_penalty=repetition_penalty,
-        do_sample=True,
-        seed=42,
-    )
-
-    formatted_prompt = format_prompt(prompt, history)
-
-    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
-    output = ""
-
-    for response in stream:
-        output += response.token.text
-        yield output
-    return output
-
-
-additional_inputs=[
-    gr.Slider(
-        label="Temperature",
-        value=0.9,
-        minimum=0.0,
-        maximum=1.0,
-        step=0.05,
-        interactive=True,
-        info="Higher values produce more diverse outputs",
-    ),
-    gr.Slider(
-        label="Max new tokens",
-        value=256,
-        minimum=0,
-        maximum=1048,
-        step=64,
-        interactive=True,
-        info="The maximum numbers of new tokens",
-    ),
-    gr.Slider(
-        label="Top-p (nucleus sampling)",
-        value=0.90,
-        minimum=0.0,
-        maximum=1,
-        step=0.05,
-        interactive=True,
-        info="Higher values sample more low-probability tokens",
-    ),
-    gr.Slider(
-        label="Repetition penalty",
-        value=1.2,
-        minimum=1.0,
-        maximum=2.0,
-        step=0.05,
-        interactive=True,
-        info="Penalize repeated tokens",
-    )
-]
-
-
-gr.ChatInterface(
-    fn=generate,
-    chatbot=gr.Chatbot(show_label=False, show_share_button=False, show_copy_button=True, likeable=True, layout="panel"),
-    additional_inputs=additional_inputs,
-    title=llm
-).launch(show_api=False)
+        messages=messages,
+    ):
+        token = message.choices[0].delta.content
+
+        response += token
+        yield response
+
+demo = gr.ChatInterface(
+    respond,
+    additional_inputs=[
+        gr.Textbox(value="", label="System message"),
+        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.95,
+            step=0.05,
+            label="Top-P",
+        ),
+
+    ],
+    css=css,
+    theme="allenai/gradio-theme",
+)
+if __name__ == "__main__":
+    demo.launch()
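
One caveat in the incoming code: OpenAI-compatible streaming endpoints can emit chunks whose delta.content is None (a role-only first chunk or an empty final chunk), and respond() appends that value to response unguarded, so the stream can die with a TypeError on "response += None". The loop variable also reuses the name "message", shadowing the user-message parameter. Below is a minimal hardened sketch of just that loop, not part of the commit; it keeps the client, model, and sampling parameters from the new app.py, and stream_reply is a hypothetical helper name:

def stream_reply(client, messages, max_tokens, temperature, top_p):
    # Sketch only (not in the commit): same request as respond(), but
    # chunks that carry no text are skipped instead of crashing on None.
    response = ""
    for chunk in client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-70B-Instruct",
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        messages=messages,
    ):
        if not chunk.choices:
            continue  # some servers send housekeeping chunks with no choices
        token = chunk.choices[0].delta.content
        if token:  # delta.content may be None on role-only or final chunks
            response += token
            yield response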
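The new app.py also depends on an HF_TOKEN environment variable and on the serverless API actually serving Meta-Llama-3.1-70B-Instruct, so a standalone request against the same endpoint is a quick way to verify both before debugging inside Gradio. A minimal sketch reusing the base URL and model name from the commit; the prompt string is illustrative:

import os
from openai import OpenAI

# Same endpoint and model as the committed app.py; fails fast if HF_TOKEN
# is unset or the model is not available to this account.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=os.environ["HF_TOKEN"],
)
completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct",
    messages=[{"role": "user", "content": "Reply with one word: pong"}],
    max_tokens=8,
)
print(completion.choices[0].message.content)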