Spaces:

jjgomez
/

UCMBot

Sleeping

App Files Files Community

jjgomez committed on Jan 1, 2024

Commit

c40e28a

1 Parent(s): fca7087

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -8

app.py CHANGED Viewed

@@ -1,34 +1,46 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import gradio as gr
 import torch
-title = "????AI ChatBot"
 description = "A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT)"
 examples = [["How are you?"]]
-tokenizer = AutoTokenizer.from_pretrained("clibrain/Llama-2-13b-ft-instruct-es-gptq-4bit")
-model = AutoModelForCausalLM.from_pretrained("clibrain/Llama-2-13b-ft-instruct-es-gptq-4bit")
 def predict(input, history=[]):
     # tokenize the new input sentence
     new_user_input_ids = tokenizer.encode(
         input + tokenizer.eos_token, return_tensors="pt"
-    )
     # append the new user input tokens to the chat history
-    bot_input_ids = torch.cat([torch.LongTensor(history), new_user_input_ids], dim=-1)
     # generate a response
     history = model.generate(
         bot_input_ids, max_length=4000, pad_token_id=tokenizer.eos_token_id
-    ).tolist()
     # convert the tokens to text, and then split the responses into lines
     response = tokenizer.decode(history[0]).split("<|endoftext|>")
     # print('decoded_response-->>'+str(response))
     response = [
         (response[i], response[i + 1]) for i in range(0, len(response) - 1, 2)
     ]  # convert to tuples of list

+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import gradio as gr
 import torch
+title = "????AI ChatBot bajo GPU"
 description = "A State-of-the-Art Large-scale Pretrained Response generation model (DialoGPT)"
 examples = [["How are you?"]]
+model_id="clibrain/Llama-2-13b-ft-instruct-es-gptq-4bit"
+config = AutoConfig.from_pretrained(model_id)
+#config.quantization_config["use_exllama"] = True
+config.quantization_config["disable_exllama"] = True
+config.quantization_config["exllama_config"] = {"version":2}
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+print("********************")
+print(device)
+print("********************")
+model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, config=config)
+model = model.to(device)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
 def predict(input, history=[]):
     # tokenize the new input sentence
     new_user_input_ids = tokenizer.encode(
         input + tokenizer.eos_token, return_tensors="pt"
+    ).to(device)
     # append the new user input tokens to the chat history
+    historygpu=torch.LongTensor(history).to(device)
+    bot_input_ids = torch.cat([historygpu, new_user_input_ids], dim=-1)
     # generate a response
     history = model.generate(
         bot_input_ids, max_length=4000, pad_token_id=tokenizer.eos_token_id
+    )
     # convert the tokens to text, and then split the responses into lines
     response = tokenizer.decode(history[0]).split("<|endoftext|>")
     # print('decoded_response-->>'+str(response))
+    print(response)
     response = [
         (response[i], response[i + 1]) for i in range(0, len(response) - 1, 2)
     ]  # convert to tuples of list