pandora-s committed
Commit a6a707a · verified · 1 Parent(s): b26fd1f

Update app.py

Files changed (1): app.py +19 -7
app.py CHANGED
@@ -33,6 +33,10 @@ import requests
 
 from huggingface_hub import snapshot_download
 
+default_temperature = 0.15
+default_max_context = 16384
+default_max_output = 512
+
 default_bpw = "4.0bpw"
 available_models = [
     "2.5bpw",
@@ -49,14 +53,18 @@ for model in available_models:
     dirs.update({model: snapshot_download(repo_id="turboderp/pixtral-12b-exl2", revision=model)})
 
 @spaces.GPU(duration=45)
-def run_inference(message, history, model_picked):
-    if model_picked == None:
+def run_inference(message, history, model_picked, temperature, context_size, max_output):
+    if not model_picked:
         model_picked = default_bpw
+    if not temperature:
+        temperature = default_temperature
+    if not context_size:
+        context_size = default_max_context
+    if not max_output:
+        max_output = default_max_output
 
     local_dir = dirs[model_picked]
 
-    print(message)
-    print(history)
     # Loading only once GPU available
     config = ExLlamaV2Config(local_dir)
     config.max_seq_len = 16384
@@ -141,9 +149,10 @@ The current version of ExllamaV2 running is the dev branch, not the master branc
 
 The model at **4bpw and 16k context size fits in less than 12GB of VRAM**!
 
-The current settings are:
+The current default settings are:
+- Model Quant: 4.0bpw
 - Context Size: 16k tokens
-- Max Output: 1024 tokens
+- Max Output: 512 tokens
 - Temperature: 0.15
 
 You can select other quants and experiment!
@@ -157,5 +166,8 @@ examples = [
 ]
 
 drop = gr.Dropdown(available_models, label="EXL2 Quant", value=default_bpw)
-demo = gr.ChatInterface(fn=run_inference, examples = examples, title="Pixtral 12B EXL2", multimodal=True, description=description, additional_inputs = drop)
+temperature_gradio = gr.Slider(minimum = 0, maximum = 1, label="Temperature", value=default_temperature, step = 0.05)
+context_size_gradio = gr.Slider(minimum = 256, maximum = 32768, label="Context Size", value=default_max_context, step = 1)
+output_length_gradio = gr.Slider(minimum = 1, maximum = 4096, label="Max Output Length", value=default_max_output, step = 1)
+demo = gr.ChatInterface(fn=run_inference, examples = examples, title="Pixtral 12B EXL2", multimodal=True, description=description, additional_inputs = [drop, temperature_gradio, context_size_gradio, output_length_gradio])
 demo.queue().launch()
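
For context on how the new controls reach the model call: gr.ChatInterface passes each component listed in additional_inputs to fn positionally, after (message, history), in the order given. Below is a minimal, self-contained sketch of that wiring (assumed Gradio 4.x API; the reply() stub only echoes the settings and stands in for the Space's actual ExLlamaV2 inference):

```python
# Minimal sketch, not the Space's actual inference code: shows how
# gr.ChatInterface forwards the components in additional_inputs to the
# callback, positionally, after (message, history).
import gradio as gr

default_bpw = "4.0bpw"
default_temperature = 0.15
default_max_context = 16384
default_max_output = 512

def reply(message, history, model_picked, temperature, context_size, max_output):
    # With multimodal=True, `message` is a dict: {"text": str, "files": [paths]}.
    # A real implementation would run ExLlamaV2 generation here.
    return (f"quant={model_picked}, T={temperature}, ctx={context_size}, "
            f"max_out={max_output} -> {message['text']}")

drop = gr.Dropdown(["2.5bpw", default_bpw], label="EXL2 Quant", value=default_bpw)
temperature_gradio = gr.Slider(minimum=0, maximum=1, value=default_temperature, step=0.05, label="Temperature")
context_size_gradio = gr.Slider(minimum=256, maximum=32768, value=default_max_context, step=1, label="Context Size")
output_length_gradio = gr.Slider(minimum=1, maximum=4096, value=default_max_output, step=1, label="Max Output Length")

demo = gr.ChatInterface(
    fn=reply,
    multimodal=True,
    title="Pixtral 12B EXL2",
    additional_inputs=[drop, temperature_gradio, context_size_gradio, output_length_gradio],
)

if __name__ == "__main__":
    demo.queue().launch()
```

The list order must match the callback signature after (message, history); slider values arrive as numbers and the dropdown value as a string, so the callback can use them directly as sampling settings.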