Chris STC committed on
Commit 44d3d18 · 1 Parent(s): 8532aae

Update app.py

Files changed (1): app.py +39 -30
app.py CHANGED
@@ -1,37 +1,46 @@
- import torch
- from transformers import BitsAndBytesConfig
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-
- quantization_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_compute_dtype=torch.float16,
-     bnb_4bit_quant_type="nf4",
-     bnb_4bit_use_double_quant=True,
- )
-
- # My version with smaller chunks on safetensors for low-RAM environments
- model_id = "vilsonrodrigues/falcon-7b-instruct-sharded"
-
- model_4bit = AutoModelForCausalLM.from_pretrained(
-     model_id,
-     device_map="auto",
-     quantization_config=quantization_config,
-     trust_remote_code=True)
-
- tokenizer = AutoTokenizer.from_pretrained(model_id)
-
- pipeline = pipeline(
-     "text-generation",
-     model=model_4bit,
-     tokenizer=tokenizer,
-     use_cache=True,
-     device_map="auto",
-     max_length=296,
-     do_sample=True,
-     top_k=10,
-     num_return_sequences=1,
-     eos_token_id=tokenizer.eos_token_id,
-     pad_token_id=tokenizer.eos_token_id,
  )

- print(pipeline("Hello"))
 
+ import os
+ import random
+
+ # Install llama-cpp-python (with OpenBLAS) before it is imported below.
+ os.system('CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python')
+
+ import gradio as gr
+ import wget
+ from llama_cpp import Llama
+
+ # Download the 2-bit quantized GGML weights and load them with a random seed.
+ url = 'https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGML/resolve/main/WizardLM-7B-uncensored.ggmlv3.q2_K.bin'
+ filename = wget.download(url)
+ llm2 = Llama(model_path=filename, seed=random.randint(1, 2**31))
+
+ title = """<h1 align="center">Chat with the awesome WizardLM 7B model!</h1><br>"""
+ description = "This model is awesome for its size! It is only about 1/20th the size of ChatGPT but is around 90% as good. However, please don't rely on WizardLM to provide 100% accurate information, as it can sometimes be wrong."
+
+ def bot(user_message, temperature, top_p, repeat_penalty):
+     # Build an Alpaca-style prompt: the user's text followed by a "### Response:" cue.
+     prompt_tokens = llm2.tokenize(user_message.encode())
+     response_tokens = llm2.tokenize(b"\n\n### Response:")
+     tokens = prompt_tokens + response_tokens
+
+     output = ""
+     for token in llm2.generate(tokens, top_k=50, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
+         if token == llm2.token_eos():
+             break
+         # errors="ignore" avoids crashes on multi-byte characters split across tokens.
+         output += llm2.detokenize([token]).decode("utf-8", errors="ignore")
+
+     # generate() yields only the completion, so the marker is usually absent;
+     # strip it only if the model happened to echo it back.
+     marker = "### Response:"
+     idx = output.find(marker)
+     return output[idx + len(marker):].strip() if idx != -1 else output.strip()
+
+ interface = gr.Interface(
+     fn=bot,
+     inputs=[
+         gr.Textbox(label="Your Message", placeholder="Type your message here..."),
+         gr.Slider(minimum=0, maximum=2, value=1, label="Temperature"),
+         gr.Slider(minimum=0, maximum=1, value=0.73, label="Top P"),
+         gr.Slider(minimum=0, maximum=2, value=1.1, label="Repeat Penalty")
+     ],
+     outputs="text",
+     live=True,
+     description=description,
+     title=title
  )

+ interface.launch(debug=True)
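
For a quick sanity check of the new handler outside the Gradio UI, a call like the following should work once the weights have finished downloading (a sketch; the argument values simply mirror the slider defaults, and the question is an arbitrary example):

print(bot("What is the capital of France?", temperature=1, top_p=0.73, repeat_penalty=1.1))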