DeepHermes

Running on Zero

App Files Community

vilarin committed 5 days ago

Commit

f494de5

verified ·

1 Parent(s): d6a6e58

Update app.py

Browse files

Files changed (1) (hide/show)

app.py +64 -22

app.py CHANGED Viewed

@@ -1,16 +1,22 @@
 import os
 import time
 import spaces
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import gradio as gr
 from threading import Thread
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
-MODEL = "evabyte/EvaByte-SFT"
-MODEL_BASE = "evabyte/EvaByte"
-TITLE = "<h1><center>EvaByte</center></h1>"
 PLACEHOLDER = """
 <center>
@@ -33,12 +39,23 @@ h3 {
 device = "cuda" # for GPU usage or "cpu" for CPU usage
-tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL,
-    torch_dtype=torch.bfloat16,
     device_map="auto",
-    trust_remote_code=True).eval().to(device)
 @spaces.GPU()
 def stream_chat(
@@ -46,8 +63,10 @@ def stream_chat(
     history: list,
     system_prompt: str,
     temperature: float = 0.8,
-    max_new_tokens: int = 512,
     top_p: float = 1.0,
 ):
     print(f'message: {message}')
     print(f'history: {history}')
@@ -63,26 +82,33 @@ def stream_chat(
     conversation.append({"role": "user", "content": message})
-    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(device)
-    gen_out = model.multi_byte_generate(
         input_ids=input_ids,
         max_new_tokens = max_new_tokens,
         do_sample = False if temperature == 0 else True,
         top_p = top_p,
         temperature = temperature,
     )
-    response = tokenizer.decode(
-        gen_out[0][input_ids.shape[1]:],
-        skip_special_tokens=False,
-        clean_up_tokenization_spaces=False
-    )
-    for i in range(len(response)):
-        time.sleep(0.02)
-        yield response[: i + 1]
 chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
 with gr.Blocks(css=CSS, theme="soft") as demo:
@@ -95,7 +121,7 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
         additional_inputs=[
             gr.Textbox(
-                value="You are a helpful assistant.",
                 label="System Prompt",
                 lines=5,
                 render=False,
@@ -112,7 +138,7 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
                 minimum=128,
                 maximum=8192,
                 step=1,
-                value= 512,
                 label="Max new tokens",
                 render=False,
             ),
@@ -124,6 +150,22 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
                 label="top_p",
                 render=False,
             ),
         ],
         examples=[
             ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],

+import subprocess
+subprocess.run(
+    'pip install flash-attn --no-build-isolation',
+    env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
+    shell=True
+)
 import os
 import time
 import spaces
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
 import gradio as gr
 from threading import Thread
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
+MODEL = "NousResearch/DeepHermes-3-Llama-3-8B-Preview"
+TITLE = "<h1><center>DeepHermes-3-Llama-3-8B</center></h1>"
 PLACEHOLDER = """
 <center>
 device = "cuda" # for GPU usage or "cpu" for CPU usage
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type= "nf4")
+tokenizer = AutoTokenizer.from_pretrained(MODEL)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL,
+    torch_dtype=torch.float16,
     device_map="auto",
+    attn_implementation="flash_attention_2",
+    quantization_config=quantization_config)
+# Fall back to the EOS token for padding when the tokenizer has no pad token
+if tokenizer.pad_token_id is None:
+    tokenizer.pad_token_id = tokenizer.eos_token_id
 @spaces.GPU()
 def stream_chat(
     history: list,
     system_prompt: str,
     temperature: float = 0.8,
+    max_new_tokens: int = 2500,
     top_p: float = 1.0,
+    top_k: int = 20,
+    penalty: float = 1.1,
 ):
     print(f'message: {message}')
     print(f'history: {history}')
     conversation.append({"role": "user", "content": message})
+    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
         input_ids=input_ids,
         max_new_tokens = max_new_tokens,
         do_sample = False if temperature == 0 else True,
         top_p = top_p,
+        top_k = top_k,
+        eos_token_id = tokenizer.eos_token_id,
+        pad_token_id = tokenizer.pad_token_id,
         temperature = temperature,
+        repetition_penalty=penalty,
+        streamer=streamer,
     )
+    with torch.no_grad():
+        thread = Thread(target=model.generate, kwargs=generate_kwargs)
+        thread.start()
+    buffer = ""
+    for new_text in streamer:
+        buffer += new_text
+        yield buffer
 chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
 with gr.Blocks(css=CSS, theme="soft") as demo:
         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
         additional_inputs=[
             gr.Textbox(
+                value="You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help come to a correct solution prior to answering. You should enclose your thoughts and internal monologue inside <think> </think> tags, and then provide your solution or response to the problem.",
                 label="System Prompt",
                 lines=5,
                 render=False,
                 minimum=128,
                 maximum=8192,
                 step=1,
+                value= 2500,
                 label="Max new tokens",
                 render=False,
             ),
                 label="top_p",
                 render=False,
             ),
+            gr.Slider(
+                minimum=1,
+                maximum=20,
+                step=1,
+                value=20,
+                label="top_k",
+                render=False,
+            ),
+            gr.Slider(
+                minimum=0.0,
+                maximum=2.0,
+                step=0.1,
+                value=1.1,
+                label="Repetition penalty",
+                render=False,
+            ),
         ],
         examples=[
             ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],