#!/usr/bin/env python
from llama_cpp import Llama
from time import time
import gradio as gr
import psutil
import os
# load it like this - use the variable everywhere
model_hf_path = os.getenv("MODEL_HF_PATH")
# show a warning when it is empty, plus a brief description of how to set it
# also add a link to "how to search" (linking to TheBloke by default) + an example search link + an example full value (Mistral base?)
# info about RAM requirements
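# A minimal sketch of that warning (assumptions: the wording, the links, and the example repo
# "TheBloke/Mistral-7B-v0.1-GGUF" are illustrative choices, not fixed requirements):
if not model_hf_path:
    print(
        "WARNING: MODEL_HF_PATH is not set. Set it in the Space settings to a Hugging Face\n"
        "repo that hosts a GGUF model, e.g. TheBloke/Mistral-7B-v0.1-GGUF.\n"
        "Browse https://huggingface.co/TheBloke or search https://huggingface.co/models?search=gguf\n"
        "Note: a quantized 7B model typically needs a few GiB of free RAM (roughly 4-8 GiB)."
    )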
# Initializing things
print(f"debug: init model: {model_hf_path}")
#llm = Llama(model_path="./model.bin")  # llama.cpp model (not loaded yet)
print("! INIT DONE !")
# Preparing the interface text
title = "# Demo for 7B Models - Quantized"
descr = '''
Quantized to run in the free-tier hosting.
A quick way to test models or share them with others without hassle.
It runs slowly, as it is on CPU, but it is usable for basic tests.
It uses quantized models in GGUF format and llama.cpp to run them.
Powered by ...'''
print(f"DEBUG: Memory free: {psutil.virtual_memory().free / (1024.0 ** 3)} GiB")
print(f"DEBUG: Memory available: {psutil.virtual_memory().available / (1024.0 ** 3)} GiB")
print(f"DEBUG: Memory: {psutil.virtual_memory().total / (1024.0 ** 3)} GiB")
from threading import Thread
from typing import Iterator
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
DESCRIPTION = f"# Test model: {model_hf_path}"
if not torch.cuda.is_available():
    DESCRIPTION += "\n<p>This Space is running on CPU only. Use different hardware if you want it to go fast on a GPU.</p>"
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
# download the model here
# check local storage: if the model is not there, download it; otherwise use the existing copy
# check gradio - how does it download? is there a function we can use?
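# A minimal sketch using huggingface_hub instead (assumptions: MODEL_HF_PATH is the repo id and a
# hypothetical GGUF_FILENAME variable names the file inside it; hf_hub_download caches the file
# locally, so a second run reuses the existing copy instead of re-downloading):
from huggingface_hub import hf_hub_download
gguf_filename = os.getenv("GGUF_FILENAME")  # hypothetical variable naming the .gguf file in the repo
if model_hf_path and gguf_filename:
    local_model_path = hf_hub_download(repo_id=model_hf_path, filename=gguf_filename)
    # llm = Llama(model_path=local_model_path)  # would load it with llama.cpp, like the commented line above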
if torch.cuda.is_available():
    model_id = "mistralai/Mistral-7B-Instruct-v0.1"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
# We need to make sure we only run one generation at a time, or we will probably run out of RAM
# (see the lock sketch after generate below).
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    conversation = []
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})
    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
    input_ids = input_ids.to(model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        {"input_ids": input_ids},
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        num_beams=1,
        repetition_penalty=repetition_penalty,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=MAX_MAX_NEW_TOKENS,
            step=1,
            value=DEFAULT_MAX_NEW_TOKENS,
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=4.0,
            step=0.1,
            value=0.6,
        ),
        gr.Slider(
            label="Top-p (nucleus sampling)",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=1000,
            step=1,
            value=50,
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
        ),
    ],
    stop_btn=None,
    # add more eval examples, e.g. a longer list taken from teknium and others, maybe grouped by type
    examples=[
        ["Hello there! How are you doing?"],
        ["Can you explain briefly to me what is the Python programming language?"],
        ["Explain the plot of Cinderella in a sentence."],
        ["How many hours does it take a man to eat a Helicopter?"],
        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
    ],
)
with gr.Blocks(css="style.css") as demo:
    gr.Markdown(title)
    gr.Markdown(descr)
    gr.DuplicateButton(
        value="Duplicate Space for private use",
        elem_id="duplicate-button",
        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
        # add
    )
    chat_interface.render()
if __name__ == "__main__":
    demo.queue(max_size=20).launch()