valencar committed
Commit a83e000 · verified · 1 Parent(s): bda0304

Update app.py

Files changed (1)
  1. app.py +17 -35
app.py CHANGED
@@ -9,52 +9,34 @@ question = "Qual é o maior planeta do sistema solar?"
 
 before = datetime.datetime.now()
 
-# from transformers.modeling_outputs import Seq2SeqModelOutput, BaseModelOutput
-
-from mlx_lm import load, generate
-
-model, tokenizer = load("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit")
-
-prompt = "Question: Qual é o maior planeta do sistema solar ?"
-
-response = generate(model, tokenizer, prompt=prompt, verbose=True)
-
-# inputs = tokenizer(prompt, return_tensors="pt")
-# outputs = model(**inputs) #, labels=inputs["input_ids"])
-
-# last_hidden_states = outputs.last_hidden_state
-
-# output = last_hidden_states #['last_hidden_states']
-
-# input_text = "The theory of special relativity states "
-# input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-# XGLMForCausalLM
-
-# outputs = model(**inputs)
-# output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-
-# decoded = tokenizer.decode(output)
-
-# output = BaseModelOutput(last_hidden_states['last_hidden_states'])
-
-# logits = last_hidden_states.logits
-
-# output = last_hidden_states[0][0]
-
-# decoded = tokenizer.decode(output) # [0][0]
-# print(decoded)
-
-# output = Seq2SeqModelOutput(output)
-
-# output = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4"
+prompt = [
+    {"role": "system", "content": "You are a helpful assistant"},
+    {"role": "user", "content": question},
+]
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").cuda()
+
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype=torch.bfloat16,
+    low_cpu_mem_usage=True,
+    device_map="auto",
+)
+
+outputs = model.generate(inputs, do_sample=True, max_new_tokens=256)
+response = tokenizer.batch_decode(outputs, skip_special_tokens=True)
 
 with st.container():
     st.write('\n\n')
     st.write('LLM-LANAChat\n\n')
-    # st.write(outputs)
     st.write(response)
 
 print('\nsaida gerada.')
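
Side note, not part of this commit: with a decoder-only model, model.generate returns the prompt token ids followed by the newly generated ones, so tokenizer.batch_decode over the full outputs tensor also echoes the rendered chat template into response. Below is a minimal sketch of how the display step could show only the assistant's reply, reusing the inputs, outputs, and tokenizer objects defined in the new app.py; the slicing is illustrative, not committed code.

    # Illustrative only: trim the echoed prompt before displaying the answer.
    prompt_length = inputs.shape[-1]              # number of prompt tokens passed to generate()
    generated_only = outputs[:, prompt_length:]   # keep only the newly generated token ids
    reply = tokenizer.batch_decode(generated_only, skip_special_tokens=True)[0]
    st.write(reply)                               # show just the reply text instead of the full decoded list

Whether to strip the prompt echo is an app-level choice; the committed version simply writes the full decoded output.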