Update app.py
app.py CHANGED
@@ -33,6 +33,10 @@ import requests
 
 from huggingface_hub import snapshot_download
 
+default_temperature = 0.15
+default_max_context = 16384
+default_max_output = 512
+
 default_bpw = "4.0bpw"
 available_models = [
     "2.5bpw",
@@ -49,14 +53,18 @@ for model in available_models:
     dirs.update({model: snapshot_download(repo_id="turboderp/pixtral-12b-exl2", revision=model)})
 
 @spaces.GPU(duration=45)
-def run_inference(message, history, model_picked):
-    if model_picked
+def run_inference(message, history, model_picked, temperature, context_size, max_output):
+    if not model_picked:
         model_picked = default_bpw
+    if not temperature:
+        temperature = default_temperature
+    if not context_size:
+        context_size = default_max_context
+    if not max_output:
+        max_output = default_max_output
 
     local_dir = dirs[model_picked]
 
-    print(message)
-    print(history)
     # Loading only once GPU available
     config = ExLlamaV2Config(local_dir)
     config.max_seq_len = 16384
@@ -141,9 +149,10 @@ The current version of ExllamaV2 running is the dev branch, not the master branc
 
 The model at **4bpw and 16k context size fits in less than 12GB of VRAM**!
 
-The current settings are:
+The current default settings are:
+- Model Quant: 4.0bpw
 - Context Size: 16k tokens
-- Max Output:
+- Max Output: 512 tokens
 - Temperature: 0.15
 
 You can select other quants and experiment!
@@ -157,5 +166,8 @@ examples = [
 ]
 
 drop = gr.Dropdown(available_models, label="EXL2 Quant", value=default_bpw)
-demo = gr.ChatInterface(fn=run_inference, examples = examples, title="Pixtral 12B EXL2", multimodal=True, description=description, additional_inputs = [drop])
+temperature_gradio = gr.Slider(minimum = 0, maximum = 1, label="Temperature", value=default_temperature, step = 0.05)
+context_size_gradio = gr.Slider(minimum = 256, maximum = 32768, label="Context Size", value=default_max_context, step = 1)
+output_length_gradio = gr.Slider(minimum = 1, maximum = 4096, label="Max Output Length", value=default_max_output, step = 1)
+demo = gr.ChatInterface(fn=run_inference, examples = examples, title="Pixtral 12B EXL2", multimodal=True, description=description, additional_inputs = [drop, temperature_gradio, context_size_gradio, output_length_gradio])
 demo.queue().launch()
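For context on how the new controls reach run_inference: gr.ChatInterface passes the current value of each component listed in additional_inputs to the chat function as extra positional arguments after message and history, in the same order as the list. Below is a minimal, self-contained sketch of that mechanism with placeholder names and values (echo, temperature_s, etc.); it illustrates the Gradio API, not this Space's actual code.

import gradio as gr

# Toy chat function: the extra parameters arrive after (message, history),
# matching the order of the components in additional_inputs.
def echo(message, history, model_picked, temperature, context_size, max_output):
    return f"quant={model_picked}, temp={temperature}, ctx={context_size}, max_out={max_output}"

drop = gr.Dropdown(["2.5bpw", "4.0bpw"], label="EXL2 Quant", value="4.0bpw")
temperature_s = gr.Slider(minimum=0, maximum=1, value=0.15, step=0.05, label="Temperature")
context_s = gr.Slider(minimum=256, maximum=32768, value=16384, step=1, label="Context Size")
output_s = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max Output Length")

demo = gr.ChatInterface(
    fn=echo,
    additional_inputs=[drop, temperature_s, context_s, output_s],
)

if __name__ == "__main__":
    demo.queue().launch()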