import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, TextIteratorStreamer, pipeline


@spaces.GPU
def load_model(model_name):
    """Build a CUDA text-generation pipeline for ``model_name``.

    The model is loaded in bfloat16 with ``trust_remote_code=True``, so code
    shipped in the model repository is executed; only list trusted models.

    Args:
        model_name: Hugging Face Hub repository id of the model to load.

    Returns:
        The ``transformers`` text-generation pipeline for that model.
    """
    return pipeline(
        "text-generation",
        model=model_name,
        device_map="cuda",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        # .get() instead of [...]: when the "token" variable is not set we
        # fall back to unauthenticated access instead of raising KeyError.
        token=os.environ.get("token"),
    )


def _build_prompt(model_name, user_input):
    """Return the exact text sent to the model for ``user_input``."""
    # tau-1.8B gets the user's text unchanged; every other model gets it
    # wrapped in the <|im_start|>/<|im_end|> system/user/assistant template.
    if model_name == "M4-ai/tau-1.8B":
        return user_input
    # The wording below is runtime text seen by the models; keep it verbatim.
    return (
        "<|im_start|>system\n"
        "You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps.<|im_end|>\n"
        f"<|im_start|>user\n{user_input}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )


def _run_generation(pipe, streamer, errors, generation_kwargs):
    """Run the pipeline in a worker thread, recording any failure in ``errors``."""
    try:
        pipe(**generation_kwargs)
    except Exception as exc:  # thread boundary: re-raised by the consumer
        errors.append(exc)
        # Unblock the consumer right away; otherwise it would wait for the
        # full streamer timeout before noticing that generation died.
        streamer.end()


@spaces.GPU()
def generate(
    model_name,
    user_input,
    temperature=0.4,
    top_p=0.95,
    top_k=50,
    max_new_tokens=256,
):
    """Stream a completion for ``user_input`` from the selected model.

    Args:
        model_name: Hugging Face Hub repository id of the model to use.
        user_input: Text entered by the user.
        temperature: Sampling temperature. Values <= 0 switch to greedy
            decoding, because sampling requires a strictly positive value.
        top_p: Nucleus-sampling threshold (used only when sampling).
        top_k: Top-k sampling cutoff (used only when sampling).
        max_new_tokens: Maximum number of tokens to generate.

    Yields:
        The text generated so far, growing with every streamed chunk.

    Raises:
        Exception: Whatever the pipeline raised during generation, re-raised
            here once the stream has ended.
    """
    pipe = load_model(model_name)
    prompt = _build_prompt(model_name, user_input)

    streamer = TextIteratorStreamer(
        pipe.tokenizer,
        timeout=240.0,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    # UI sliders may hand over ints (e.g. 1 instead of 1.0) and allow 0;
    # normalise the types and only sample when the temperature is usable.
    temperature = float(temperature)
    do_sample = temperature > 0
    generation_kwargs = dict(
        text_inputs=prompt,
        streamer=streamer,
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        num_beams=1,
        repetition_penalty=1.1,
    )
    if do_sample:
        generation_kwargs.update(
            temperature=temperature,
            top_p=float(top_p),
            top_k=int(top_k),
        )

    errors = []
    worker = Thread(
        target=_run_generation,
        args=(pipe, streamer, errors, generation_kwargs),
    )
    worker.start()

    outputs = []
    for chunk in streamer:
        outputs.append(chunk)
        yield "".join(outputs)

    worker.join()
    if errors:
        raise errors[0]


model_choices = [
    "M4-ai/Hercules-Mini-1.8B",
    "Locutusque/Hyperion-3.0-Mistral-7B-DPO",
    "Locutusque/OpenCerebrum-1.5-Mistral-11B-Evolved-beta",
    "M4-ai/tau-1.8B",
    "Locutusque/OpenCerebrum-1.5-Mistral-7b-v0.2-alpha",
    "Locutusque/SlimHercules-4.0-Mistral-7B-v0.2",
    "Locutusque/Hercules-3.1-Mistral-7B",
]

# What are the best options?
# Input widgets, in the positional order they are passed to `generate`.
_inputs = [
    gr.components.Dropdown(
        choices=model_choices,
        label="Model",
        value=model_choices[0],
        interactive=True,
    ),
    gr.components.Textbox(
        lines=2,
        label="Prompt",
        value="Write me a Python program that calculates the factorial of a given number.",
    ),
    gr.components.Slider(minimum=0, maximum=1, value=0.4, label="Temperature"),
    gr.components.Slider(minimum=0, maximum=1, value=0.95, label="Top p"),
    gr.components.Slider(minimum=0, maximum=100, step=1, value=50, label="Top k"),
    gr.components.Slider(minimum=1, maximum=2048, step=1, value=1024, label="Max tokens"),
]

# Single text area that receives the streamed completion.
_outputs = [gr.Textbox(lines=10, label="Output")]

g = gr.Interface(
    fn=generate,
    inputs=_inputs,
    outputs=_outputs,
    title="Locutusque's Language Models",
    description="Try out Locutusque's (or other's) language models here! Credit goes to Mediocreatmybest for this space. You may also find some experimental preview models that have not been made public here.",
    concurrency_limit=1,
)

g.launch(max_threads=4)