import datetime

import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Alternative prompts kept for reference; only the last assignment is used.
# question = "Name the planets in the solar system? A: "
# question = "What are the planets in the solar system?"
question = "What is the largest planet in the solar system?"

before = datetime.datetime.now()

# 4-bit NF4 (bitsandbytes) quantized build of Meta-Llama-3.1-8B-Instruct.
model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4"

prompt = [
    {"role": "system", "content": "You are a helpful assistant"},
    {"role": "user", "content": question},
]

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map="auto",
)

# Build the chat-formatted input ids and move them to the model's device
# (replaces the commented-out .cuda() call, which breaks on CPU-only machines).
inputs = tokenizer.apply_chat_template(
    prompt,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

outputs = model.generate(inputs, do_sample=True, max_new_tokens=256)
response = tokenizer.batch_decode(outputs, skip_special_tokens=True)

with st.container():
    st.write('\n\n')
    st.write('LLM-LANAChat\n\n')
    st.write(response)

print('\nOutput generated.')
print('\n\n')

after = datetime.datetime.now()
elapsed = after - before  # a datetime.timedelta, e.g. 0:00:42.123456
print("\nTime Elapsed: ", elapsed)
st.write("\nTime Elapsed: ", elapsed)
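# Usage sketch (an assumption, not part of the original script): save this file
# under a name of your choice, e.g. app.py (hypothetical), and launch it with
# Streamlit so the st.write calls render in the browser:
#
#   streamlit run app.py
#
# An NF4-quantized 8B model should fit in roughly 6 GB of GPU memory; with
# device_map="auto" it also falls back to CPU, just much more slowly.
#
# Note: batch_decode above returns the full chat transcript (system + user +
# assistant turns), since generate() echoes the prompt tokens. To display only
# the newly generated reply, one could decode just the new tokens instead:
#
#   reply = tokenizer.batch_decode(
#       outputs[:, inputs.shape[-1]:], skip_special_tokens=True
#   )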