winglian committed
Commit 1dc6c65 · Parent: e3ba05b

link model attributions, use config.yml for some of the chat settings, increase context size

Files changed (2)
  1. chat.py +3 -3
  2. config.yml +8 -2
chat.py CHANGED
@@ -36,7 +36,7 @@ def chat(history, system_message):
                            for item in history])
 
     history[-1][1] = ""
-    for output in llm(messages, max_tokens=512, stop=["</s>", "<unk>", "### User:"], echo=False, stream=True):
+    for output in llm(messages, echo=False, stream=True, **config['chat']):
         answer = output['choices'][0]['text']
         history[-1][1] += answer
 
@@ -91,7 +91,7 @@ with blocks:
     stop.click(fn=None, inputs=None, outputs=None, cancels=[submit_click_event, message_submit_event], queue=False)
 
     gr.Markdown(f"""
-    - This is the {config["repo"]}/{config["file"]} model.
+    - This is the [{config["repo"]}](https://huggingface.co/{config["repo"]}) model file [{config["file"]}](https://huggingface.co/{config["repo"]}/blob/main/{config["file"]})
     - This Space uses GGML with GPU support, so it can run larger models on smaller GPUs & VRAM quickly.
     - This is running on a smaller, shared GPU, so it may take a few seconds to respond.
     - [Duplicate the Space](https://huggingface.co/spaces/openaccess-ai-collective/ggml-ui?duplicate=true) to skip the queue and run in a private space or to use your own GGML models.
@@ -99,4 +99,4 @@ with blocks:
     - Contribute at [https://github.com/OpenAccess-AI-Collective/ggml-webui](https://github.com/OpenAccess-AI-Collective/ggml-webui)
     """)
 
-blocks.queue(max_size=8, concurrency_count=2).launch(debug=True, server_name="0.0.0.0", server_port=7860)
+blocks.queue(max_size=32, concurrency_count=4).launch(debug=True, server_name="0.0.0.0", server_port=7860)
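The functional change above is that the generation kwargs are no longer hard-coded: the old `max_tokens=512, stop=[...]` arguments are replaced by splatting the new `chat` section of config.yml into the llama-cpp-python call. A minimal sketch of that wiring follows; the config loading and `Llama(...)` construction here are assumptions for illustration, and only the `llm(...)` call mirrors this commit.

```python
import yaml
from llama_cpp import Llama

# Assumed loading step: chat.py reads the same config.yml edited in this commit.
with open("config.yml") as f:
    config = yaml.safe_load(f)

# Assumed construction; n_ctx / n_gpu_layers come from the llama_cpp: block.
llm = Llama(model_path=config["file"], **config["llama_cpp"])

messages = "### User: Hello\n### Assistant:"  # illustrative prompt only

# config["chat"] expands to max_tokens=1024 and stop=["</s>", "<unk>", "### User:"],
# replacing the previously hard-coded max_tokens=512 and stop list.
for output in llm(messages, echo=False, stream=True, **config["chat"]):
    print(output["choices"][0]["text"], end="", flush=True)
```

Keeping the stop strings and token budget in config.yml means a duplicated Space only has to edit one file to point at a different GGML model and prompt format, instead of patching chat.py.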
config.yml CHANGED
@@ -4,5 +4,11 @@ file: wizard-vicuna-13B.ggml.q5_1.bin
 # if the repo above doesn't include the tokenizer set the base repo it was based on with a valid tokenizer model
 base_model: junelee/wizard-vicuna-13b
 llama_cpp:
-  n_ctx: 1024
-  n_gpu_layers: 40 # llama 13b has 40 layers
+  n_ctx: 2048
+  n_gpu_layers: 40 # llama 13b has 40 layers
+chat:
+  max_tokens: 1024
+  stop:
+    - "</s>"
+    - "<unk>"
+    - "### User:"