likewendy committed
Commit aca8f85 · 1 Parent(s): fbb6492
Files changed (2):
  1. app.py +61 -31
  2. bpp.py +0 -49
app.py CHANGED
@@ -1,13 +1,23 @@
-import os
+import spaces
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
-from llama_cpp import Llama
+import os
+from threading import Thread
+
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
 
-llm = Llama.from_pretrained(
-    repo_id="matteogeniaccio/phi-4",
-    filename="phi-4-Q4_K_M.gguf",
-    verbose=True
+model = AutoModelForCausalLM.from_pretrained(
+    "NyxKrage/Microsoft_Phi-4",
+    device_map="cuda",
+    torch_dtype="auto",
+    trust_remote_code=True,
 )
+tokenizer = AutoTokenizer.from_pretrained("NyxKrage/Microsoft_Phi-4")
 
+streamer = TextIteratorStreamer(tokenizer)
+
+@spaces.GPU
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -15,36 +25,54 @@ def respond(
     max_tokens,
     temperature,
     top_p,
+    seed,
 ):
-    # Build the message list
     messages = [{"role": "system", "content": system_message}]
-    for user_msg, assistant_msg in history:
-        if user_msg:
-            messages.append({"role": "user", "content": user_msg})
-        if assistant_msg:
-            messages.append({"role": "assistant", "content": assistant_msg})
+
+    for val in history:
+        if val[0]:
+            messages.append({"role": "user", "content": val[0]})
+        if val[1]:
+            messages.append({"role": "assistant", "content": val[1]})
+
     messages.append({"role": "user", "content": message})
 
-    # Generate a response with llama-cpp-python
-    response = llm.create_chat_completion(
-        messages=messages,
-        max_tokens=max_tokens,
+    # Convert messages to the format expected by the model
+    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
+
+    torch.random.manual_seed(seed)
+
+    generation_kwargs = dict(
+        input_ids=input_ids,
+        max_new_tokens=max_tokens,
         temperature=temperature,
+        streamer=streamer,
        top_p=top_p,
-        stream=True
+        do_sample=True,
     )
-
-    # Handle the streamed response
-    partial_message = ""
-    for chunk in response:
-        if chunk and chunk.get("choices") and chunk["choices"][0].get("delta", {}).get("content"):
-            content = chunk["choices"][0]["delta"]["content"]
-            partial_message += content
-            yield partial_message
-
-# Gradio UI
+
+    response = ""
+    # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    # Print the generated text in real-time
+    for new_text in streamer:
+        response += new_text
+        yield response
+
 with gr.Blocks() as demo:
     gr.LoginButton(min_width=250)
+    """
+    For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
+    """
+    gr.Markdown("""
+    This is the space I built.
+    As of 2025/1/7, this is the first phi-4 space.
+    If this helps you, and if you have enough money, can you give me 1$? I am facing a financial crisis.
+    If you do this, I will pass on the kindness.
+    This is my bank card number:5592921230414708
+    Thank you!!
+    """)
     gr.ChatInterface(
         respond,
         additional_inputs=[
@@ -53,13 +81,15 @@ with gr.Blocks() as demo:
             gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
             gr.Slider(
                 minimum=0.1,
-                maximum=1.0,
-                value=0.95,
-                step=0.05,
-                label="Top-p (nucleus sampling)"
+                maximum=1.0,
+                value=0.95,
+                step=0.05,
+                label="Top-p (nucleus sampling)",
             ),
+            gr.Slider(minimum=0, maximum=20091114, value=42, step=1, label="seed"),
         ],
     )
 
+
 if __name__ == "__main__":
     demo.launch()
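
Note on the streaming pattern: the new app.py streams tokens by running model.generate on a worker thread and iterating a TextIteratorStreamer from the main thread. The sketch below isolates that pattern as an illustration only; "gpt2" is a lightweight placeholder model rather than the model in this commit, and skip_prompt=True is an extra option the committed code does not use.

# Standalone sketch of the thread + TextIteratorStreamer pattern used in the new app.py.
# Assumptions: "gpt2" is only a small placeholder model; skip_prompt=True (not used in the
# committed code) keeps the echoed prompt out of the streamed text.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Streaming generation works by", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

# generate() blocks until decoding finishes, so it runs in a background thread
# while the main thread consumes decoded text chunks as the streamer yields them.
thread = Thread(
    target=model.generate,
    kwargs=dict(**inputs, max_new_tokens=40, do_sample=False, streamer=streamer),
)
thread.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()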
bpp.py DELETED
@@ -1,49 +0,0 @@
1
- import spaces
2
- import torch
3
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
4
-
5
- import os
6
-
7
- # PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
8
- os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
9
-
10
- torch.random.manual_seed(0)
11
-
12
- model = AutoModelForCausalLM.from_pretrained(
13
- "NyxKrage/Microsoft_Phi-4",
14
- device_map="cuda",
15
- torch_dtype="auto",
16
- trust_remote_code=True,
17
- )
18
- tokenizer = AutoTokenizer.from_pretrained("NyxKrage/Microsoft_Phi-4")
19
-
20
- messages = [
21
- {"role": "system", "content": "You are a helpful AI assistant."},
22
- {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
23
- {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
24
- {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
25
- ]
26
-
27
- pipe = pipeline(
28
- "text-generation",
29
- model=model,
30
- tokenizer=tokenizer,
31
- )
32
-
33
- streamer = TextIteratorStreamer(tokenizer)
34
-
35
- generation_args = {
36
- "max_new_tokens": 500,
37
- "return_full_text": False,
38
- "temperature": 0.0,
39
- "do_sample": False,
40
- "streamer": streamer,
41
- }
42
-
43
- @spaces.GPU
44
- def tuili():
45
- model.generate(messages, **generation_args)
46
-
47
- tuili()
48
- for new_text in streamer:
49
- print(new_text)
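
One difference worth noting between the two files: the deleted bpp.py built a text-generation pipeline it never called and passed the raw message dicts straight to model.generate, while the new app.py first turns the conversation into token ids with tokenizer.apply_chat_template. Below is a minimal sketch of that step, reusing the model name from this commit; add_generation_prompt=True is an assumption, since the committed code does not pass it.

# Sketch of the chat-templating step that replaces bpp.py's direct generate(messages, ...) call.
# Model name comes from this commit; loading it needs a GPU with enough memory.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NyxKrage/Microsoft_Phi-4")
model = AutoModelForCausalLM.from_pretrained(
    "NyxKrage/Microsoft_Phi-4",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

# apply_chat_template renders the messages with the model's prompt format and
# returns token ids ready for generate(); add_generation_prompt=True is an
# assumption not present in the committed app.py.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=200, do_sample=False)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))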