likewendy committed
Commit d5e204e · 1 Parent(s): d2baa2c
Files changed (1)
  1. app.py +16 -15
app.py CHANGED
@@ -1,10 +1,12 @@
-import spaces
 import gradio as gr
 from llama_cpp import Llama
-import os
 
-# Response function
-@spaces.GPU
+llm = Llama.from_pretrained(
+    repo_id="matteogeniaccio/phi-4",
+    filename="phi-4-Q4_K_M.gguf",
+    verbose=True
+)
+
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -22,26 +24,22 @@ def respond(
     messages.append({"role": "assistant", "content": assistant_msg})
     messages.append({"role": "user", "content": message})
 
-    llm = Llama.from_pretrained(
-        repo_id="matteogeniaccio/phi-4",
-        filename="phi-4-Q4_K_M.gguf",
-        verbose=True,
-        main_gpu=1,
-        n_gpu_layers=-1
-    )
     # Generate the response via llama-cpp-python
     response = llm.create_chat_completion(
         messages=messages,
         max_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
-        stream=False
+        stream=True
     )
 
-    # Return the streaming response
+    # Handle the streaming response
+    partial_message = ""
     for chunk in response:
         if chunk and chunk.get("choices") and chunk["choices"][0].get("delta", {}).get("content"):
-            yield chunk["choices"][0]["delta"]["content"]
+            content = chunk["choices"][0]["delta"]["content"]
+            partial_message += content
+            yield partial_message
 
 # Gradio interface
 demo = gr.ChatInterface(
@@ -52,7 +50,10 @@ demo = gr.ChatInterface(
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
         gr.Slider(
             minimum=0.1,
-            maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"
+            maximum=1.0,
+            value=0.95,
+            step=0.05,
+            label="Top-p (nucleus sampling)"
         ),
     ],
 )
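The switch from yielding each raw delta to yielding the accumulated partial_message matches how gr.ChatInterface renders streaming output: every value the generator yields replaces the assistant message displayed so far, so the function must yield the running total rather than individual tokens. A minimal sketch of that contract, using a hypothetical token list in place of the real create_chat_completion stream:

import time

import gradio as gr

# Stand-in for the chunks produced by llm.create_chat_completion(..., stream=True);
# purely illustrative, not part of the commit.
FAKE_TOKENS = ["Str", "eam", "ing", " works", "!"]

def respond(message, history):
    partial_message = ""
    for token in FAKE_TOKENS:
        time.sleep(0.1)            # simulate generation latency
        partial_message += token
        yield partial_message      # cumulative text: each yield replaces the shown message

demo = gr.ChatInterface(respond)

if __name__ == "__main__":
    demo.launch()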
 
 
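Hoisting Llama.from_pretrained to module level means the GGUF is fetched from the Hugging Face Hub (and cached locally) once at startup, instead of re-initializing a multi-gigabyte model on every chat message as the old in-function call did. A sketch of the load-once pattern with a one-off, non-streaming smoke test; the prompt and max_tokens value are illustrative:

from llama_cpp import Llama

# Loaded once at import time; reused by every request.
llm = Llama.from_pretrained(
    repo_id="matteogeniaccio/phi-4",
    filename="phi-4-Q4_K_M.gguf",
    verbose=True,
)

# Illustrative smoke test (non-streaming): the reply is a plain dict,
# so the text lives under choices[0]["message"]["content"].
result = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello in one word."}],
    max_tokens=16,
)
print(result["choices"][0]["message"]["content"])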