Spaces:

Locutusque
/

Locutusque-Models

Running on Zero

App Files Files

xet

Community

Locutusque committed on Sep 2, 2024

Commit

6615fb0

verified ·

1 Parent(s): bfe628d

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -26

app.py CHANGED Viewed

@@ -10,53 +10,69 @@ def load_model(model_name):
     return pipeline("text-generation", model=model_name, device_map="cuda", torch_dtype=torch.bfloat16, trust_remote_code=True, token=os.environ["token"], use_fast=True)
 @spaces.GPU()
 def generate(
     model_name,
     system,
-    user_input,
     temperature=0.4,
     top_p=0.95,
     min_p=0.1,
     top_k=50,
     max_new_tokens=256,
 ):
-    pipe = load_model(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=os.environ["token"])
-    print(tokenizer)
-    pipe.tokenizer = tokenizer
-    # Set tokenize correctly. Otherwise ticking the box breaks it.
-    if model_name == "M4-ai/tau-1.8B":
-        prompt = user_input
-    else:
-        prompt = f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{user_input}<|im_end|>\n<|im_start|>assistant\n"
-    streamer = TextIteratorStreamer(pipe.tokenizer, timeout=240.0, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = dict(text_inputs=prompt, streamer=streamer, max_new_tokens=max_new_tokens, do_sample=True, top_p=top_p, min_p=min_p, top_k=top_k,
-                              temperature=temperature, num_beams=1, repetition_penalty=1.1)
-    t = Thread(target=pipe.__call__, kwargs=generation_kwargs)
-    t.start()
-    outputs = []
-    for chunk in streamer:
-        outputs.append(chunk)
         yield "".join(outputs)
-model_choices = ["Locutusque/Apollo-2.0-Nemo-2407-12B", "Locutusque/Apollo-2.0-Llama-3.1-8B", "Locutusque/Llama-3-NeuralHermes-Pro-8B", "Locutusque/Hercules-5.0-Qwen2-7B", "Locutusque/Llama-3-NeuralHercules-5.0-8B", "Locutusque/Hercules-5.0-Index-1.9B", "Locutusque/Llama-3-Hercules-5.0-8B"]
 # What are the best options?
-g = gr.Interface(
     fn=generate,
-    inputs=[
         gr.components.Dropdown(choices=model_choices, label="Model", value=model_choices[0], interactive=True),
         gr.components.Textbox(lines=2, label="System Prompt", value="You are an AI."),
-        gr.components.Textbox(lines=2, label="Prompt", value="Write me a Python program that calculates the factorial of a given number."),
         gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
         gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
         gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
         gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
         gr.components.Slider(minimum=1, maximum=2048, step=1, value=1024, label="Max tokens"),
     ],
-    outputs=[gr.Textbox(lines=10, label="Output")],
     title="Locutusque's Language Models",
     description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
-    concurrency_limit=1
 )
-g.launch(max_threads=4)

     return pipeline("text-generation", model=model_name, device_map="cuda", torch_dtype=torch.bfloat16, trust_remote_code=True, token=os.environ["token"], use_fast=True)
 @spaces.GPU()
 def generate(
+    history,
     model_name,
     system,
     temperature=0.4,
     top_p=0.95,
     min_p=0.1,
     top_k=50,
     max_new_tokens=256,
 ):
+    try:
+        pipe = load_model(model_name)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, token=os.environ["token"])
+        print(tokenizer)
+        pipe.tokenizer = tokenizer
+        prompt = f"<|im_start|>system\n{system}<|im_end|>\n"
+        for (user_turn, assistant_turn) in history:
+            prompt += f"<|im_start|>user\n{user_turn}<|im_end|>\n<|im_start|>assistant\n{assistant_turn}<|im_end|>\n"
+        prompt += f"<|im_start|>user\n{history[-1][0]}<|im_end|>\n<|im_start|>assistant\n"
+        streamer = TextIteratorStreamer(pipe.tokenizer, timeout=240.0, skip_prompt=True, skip_special_tokens=True)
+        generation_kwargs = dict(
+            text_inputs=prompt,
+            streamer=streamer,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            top_p=top_p,
+            min_p=min_p,
+            top_k=top_k,
+            temperature=temperature,
+            num_beams=1,
+            repetition_penalty=1.1
+        )
+        t = Thread(target=pipe.__call__, kwargs=generation_kwargs)
+        t.start()
+        outputs = []
+        for chunk in streamer:
+            outputs.append(chunk)
+            yield "".join(outputs)
+    except StopAsyncIteration:
+        print("Stream stopped unexpectedly.")
         yield "".join(outputs)
+    except Exception as e:
+        print(f"An error occurred: {e}")
+        yield "An error occurred during generation."
+model_choices = ["Locutusque/Apollo-2.0-Llama-3.1-8B", "Locutusque/Llama-3-NeuralHermes-Pro-8B", "Locutusque/Hercules-5.0-Qwen2-7B", "Locutusque/Llama-3-NeuralHercules-5.0-8B", "Locutusque/Hercules-5.0-Index-1.9B", "Locutusque/Llama-3-Hercules-5.0-8B"]
 # What are the best options?
+g = gr.ChatInterface(
     fn=generate,
+    additional_inputs=[
         gr.components.Dropdown(choices=model_choices, label="Model", value=model_choices[0], interactive=True),
         gr.components.Textbox(lines=2, label="System Prompt", value="You are an AI."),
         gr.components.Slider(minimum=0, maximum=2, value=0.8, label="Temperature"),
         gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
         gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Min P"),
         gr.components.Slider(minimum=0, maximum=100, step=1, value=15, label="Top k"),
         gr.components.Slider(minimum=1, maximum=2048, step=1, value=1024, label="Max tokens"),
     ],
     title="Locutusque's Language Models",
     description="Try out Locutusque's language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
 )
+if __name__ == "__main__":
+    g.launch()