jjgomez committed on
Commit c40e28a · 1 Parent(s): fca7087

Update app.py

Files changed (1)
  1. app.py +20 -8
app.py CHANGED
@@ -1,34 +1,46 @@
- from transformers import AutoModelForCausalLM, AutoTokenizer
  import gradio as gr
  import torch


- title = "????AI ChatBot"
  description = "A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT)"
  examples = [["How are you?"]]


- tokenizer = AutoTokenizer.from_pretrained("clibrain/Llama-2-13b-ft-instruct-es-gptq-4bit")
- model = AutoModelForCausalLM.from_pretrained("clibrain/Llama-2-13b-ft-instruct-es-gptq-4bit")


  def predict(input, history=[]):
      # tokenize the new input sentence
      new_user_input_ids = tokenizer.encode(
          input + tokenizer.eos_token, return_tensors="pt"
-     )

      # append the new user input tokens to the chat history
-     bot_input_ids = torch.cat([torch.LongTensor(history), new_user_input_ids], dim=-1)

      # generate a response
      history = model.generate(
          bot_input_ids, max_length=4000, pad_token_id=tokenizer.eos_token_id
-     ).tolist()
-
      # convert the tokens to text, and then split the responses into lines
      response = tokenizer.decode(history[0]).split("<|endoftext|>")
      # print('decoded_response-->>'+str(response))
      response = [
          (response[i], response[i + 1]) for i in range(0, len(response) - 1, 2)
      ]  # convert to tuples of list
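The hunk above ends at the response-pairing line, so the previous version's Gradio wiring is not shown. In the stock DialoGPT-style demo this app follows, predict would return the paired responses together with the token history and be attached to a gr.Interface roughly as sketched below; the return statement and the Interface call are assumptions about code outside the hunk, not part of this commit. The updated version of app.py follows the sketch.

import gradio as gr

# Hypothetical wiring, assuming predict() ends with `return response, history`
# as in the usual DialoGPT Gradio demo; these lines sit outside the hunk shown above.
demo = gr.Interface(
    fn=predict,                    # chat function defined in app.py
    title=title,
    description=description,
    examples=examples,
    inputs=["text", "state"],      # free-text prompt plus the running token history
    outputs=["chatbot", "state"],  # rendered chat pairs plus the updated history
)
demo.launch()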
 
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
  import gradio as gr
  import torch


+ title = "????AI ChatBot bajo GPU"
  description = "A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT)"
  examples = [["How are you?"]]
+ model_id="clibrain/Llama-2-13b-ft-instruct-es-gptq-4bit"
+ config = AutoConfig.from_pretrained(model_id)
+ #config.quantization_config["use_exllama"] = True
+ config.quantization_config["disable_exllama"] = True
+ config.quantization_config["exllama_config"] = {"version":2}
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ print("********************")
+ print(device)
+ print("********************")

+ model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, config=config)
+ model = model.to(device)

+ tokenizer = AutoTokenizer.from_pretrained(model_id)


  def predict(input, history=[]):
      # tokenize the new input sentence
      new_user_input_ids = tokenizer.encode(
          input + tokenizer.eos_token, return_tensors="pt"
+     ).to(device)

      # append the new user input tokens to the chat history
+     historygpu=torch.LongTensor(history).to(device)
+     bot_input_ids = torch.cat([historygpu, new_user_input_ids], dim=-1)

      # generate a response
      history = model.generate(
          bot_input_ids, max_length=4000, pad_token_id=tokenizer.eos_token_id
+     )
+
      # convert the tokens to text, and then split the responses into lines
      response = tokenizer.decode(history[0]).split("<|endoftext|>")
      # print('decoded_response-->>'+str(response))
+     print(response)
      response = [
          (response[i], response[i + 1]) for i in range(0, len(response) - 1, 2)
      ]  # convert to tuples of list
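In the new version the GPTQ kernel selection is done by mutating config.quantization_config in place: disable_exllama is set to True while exllama_config asks for version 2, which appears contradictory and likely reflects experimentation (the commented-out use_exllama line points the same way). With recent transformers releases the same intent is usually expressed by passing a GPTQConfig as quantization_config; the sketch below shows that variant and is an assumption about an alternative loading path (transformers >= 4.35 with auto-gptq and optimum installed, CUDA available), not the committed code.

# Alternative sketch, not the committed code: select the GPTQ/exllama kernels via
# GPTQConfig instead of editing config.quantization_config by hand.
# Assumes transformers >= 4.35 with auto-gptq and optimum installed, and a CUDA GPU.
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

model_id = "clibrain/Llama-2-13b-ft-instruct-es-gptq-4bit"

# use_exllama=True plus {"version": 2} enables the exllamav2 kernels;
# use_exllama=False corresponds to the disable_exllama=True line in the diff.
gptq_config = GPTQConfig(bits=4, use_exllama=True, exllama_config={"version": 2})

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=gptq_config,
    device_map="auto",        # let accelerate place the quantized weights on the GPU
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Either way, once the model sits on the GPU the tensors built inside predict have to live on the same device, which is what the added .to(device) calls in this commit take care of.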