likewendy committed
Commit aca8f85 · 1 Parent(s): fbb6492
Files changed (2):
  1. app.py +61 -31
  2. bpp.py +0 -49
app.py CHANGED
@@ -1,13 +1,23 @@
-import os
+import spaces
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 import gradio as gr
-from llama_cpp import Llama
+import os
+from threading import Thread
+
+os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
 
-llm = Llama.from_pretrained(
-    repo_id="matteogeniaccio/phi-4",
-    filename="phi-4-Q4_K_M.gguf",
-    verbose=True
+model = AutoModelForCausalLM.from_pretrained(
+    "NyxKrage/Microsoft_Phi-4",
+    device_map="cuda",
+    torch_dtype="auto",
+    trust_remote_code=True,
 )
+tokenizer = AutoTokenizer.from_pretrained("NyxKrage/Microsoft_Phi-4")
 
+streamer = TextIteratorStreamer(tokenizer)
+
+@spaces.GPU
 def respond(
     message,
     history: list[tuple[str, str]],
@@ -15,36 +25,54 @@ def respond(
     max_tokens,
     temperature,
     top_p,
+    seed,
 ):
-    # Build the message list
     messages = [{"role": "system", "content": system_message}]
-    for user_msg, assistant_msg in history:
-        if user_msg:
-            messages.append({"role": "user", "content": user_msg})
-        if assistant_msg:
-            messages.append({"role": "assistant", "content": assistant_msg})
+
+    for val in history:
+        if val[0]:
+            messages.append({"role": "user", "content": val[0]})
+        if val[1]:
+            messages.append({"role": "assistant", "content": val[1]})
+
     messages.append({"role": "user", "content": message})
 
-    # Generate a response with llama-cpp-python
-    response = llm.create_chat_completion(
-        messages=messages,
-        max_tokens=max_tokens,
+    # Convert messages to the format expected by the model
+    input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
+
+    torch.random.manual_seed(seed)
+
+    generation_kwargs = dict(
+        input_ids=input_ids,
+        max_new_tokens=max_tokens,
         temperature=temperature,
+        streamer=streamer,
        top_p=top_p,
-        stream=True
+        do_sample=True,
     )
-
-    # Handle the streamed response
-    partial_message = ""
-    for chunk in response:
-        if chunk and chunk.get("choices") and chunk["choices"][0].get("delta", {}).get("content"):
-            content = chunk["choices"][0]["delta"]["content"]
-            partial_message += content
-            yield partial_message
-
-# Gradio UI
+
+    response = ""
+    # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    # Print the generated text in real-time
+    for new_text in streamer:
+        response += new_text
+        yield response
+
 with gr.Blocks() as demo:
     gr.LoginButton(min_width=250)
+    """
+    For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
+    """
+    gr.Markdown("""
+    This is the space I built.
+    As of 2025/1/7, this is the first phi-4 space.
+    If this helps you, and if you have enough money, can you give me 1$? I am facing a financial crisis.
+    If you do this, I will pass on the kindness.
+    This is my bank card number:5592921230414708
+    Thank you!!
+    """)
     gr.ChatInterface(
         respond,
         additional_inputs=[
@@ -53,13 +81,15 @@ with gr.Blocks() as demo:
             gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
             gr.Slider(
                 minimum=0.1,
-                maximum=1.0,
-                value=0.95,
-                step=0.05,
-                label="Top-p (nucleus sampling)"
+                maximum=1.0,
+                value=0.95,
+                step=0.05,
+                label="Top-p (nucleus sampling)",
             ),
+            gr.Slider(minimum=0, maximum=20091114, value=42, step=1, label="seed"),
         ],
     )
 
+
 if __name__ == "__main__":
     demo.launch()
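
Note on the streaming pattern: the new app.py streams tokens by running model.generate on a worker thread and iterating a TextIteratorStreamer from the main thread. The sketch below isolates that pattern as an illustration only; "gpt2" is a lightweight placeholder model rather than the model in this commit, and skip_prompt=True is an extra option the committed code does not use.

# Standalone sketch of the thread + TextIteratorStreamer pattern used in the new app.py.
# Assumptions: "gpt2" is only a small placeholder model; skip_prompt=True (not used in the
# committed code) keeps the echoed prompt out of the streamed text.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Streaming generation works by", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

# generate() blocks until decoding finishes, so it runs in a background thread
# while the main thread consumes decoded text chunks as the streamer yields them.
thread = Thread(
    target=model.generate,
    kwargs=dict(**inputs, max_new_tokens=40, do_sample=False, streamer=streamer),
)
thread.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()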
bpp.py DELETED
@@ -1,49 +0,0 @@
1
- import spaces
2
- import torch
3
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextIteratorStreamer
4
-
5
- import os
6
-
7
- # PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
8
- os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
9
-
10
- torch.random.manual_seed(0)
11
-
12
- model = AutoModelForCausalLM.from_pretrained(
13
- "NyxKrage/Microsoft_Phi-4",
14
- device_map="cuda",
15
- torch_dtype="auto",
16
- trust_remote_code=True,
17
- )
18
- tokenizer = AutoTokenizer.from_pretrained("NyxKrage/Microsoft_Phi-4")
19
-
20
- messages = [
21
- {"role": "system", "content": "You are a helpful AI assistant."},
22
- {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
23
- {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
24
- {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
25
- ]
26
-
27
- pipe = pipeline(
28
- "text-generation",
29
- model=model,
30
- tokenizer=tokenizer,
31
- )
32
-
33
- streamer = TextIteratorStreamer(tokenizer)
34
-
35
- generation_args = {
36
- "max_new_tokens": 500,
37
- "return_full_text": False,
38
- "temperature": 0.0,
39
- "do_sample": False,
40
- "streamer": streamer,
41
- }
42
-
43
- @spaces.GPU
44
- def tuili():
45
- model.generate(messages, **generation_args)
46
-
47
- tuili()
48
- for new_text in streamer:
49
- print(new_text)
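
One difference worth noting between the two files: the deleted bpp.py built a text-generation pipeline it never called and passed the raw message dicts straight to model.generate, while the new app.py first turns the conversation into token ids with tokenizer.apply_chat_template. Below is a minimal sketch of that step, reusing the model name from this commit; add_generation_prompt=True is an assumption, since the committed code does not pass it.

# Sketch of the chat-templating step that replaces bpp.py's direct generate(messages, ...) call.
# Model name comes from this commit; loading it needs a GPU with enough memory.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NyxKrage/Microsoft_Phi-4")
model = AutoModelForCausalLM.from_pretrained(
    "NyxKrage/Microsoft_Phi-4",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

# apply_chat_template renders the messages with the model's prompt format and
# returns token ids ready for generate(); add_generation_prompt=True is an
# assumption not present in the committed app.py.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output = model.generate(input_ids, max_new_tokens=200, do_sample=False)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))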