import streamlit as st
# Load model directly
# from transformers import AutoModel, AutoModelForCausalLM
from huggingface_hub import login
import os

# Authenticate with the Hugging Face Hub using a token stored in the environment
access_token = os.getenv('HF_TOKEN3')
login(token=access_token)
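
# The GGUF file below is expected to sit next to this script. If it is not
# already present, it could be fetched from the Hub first -- a minimal sketch,
# assuming a repo id (here TheBloke/Llama-2-7B-GGUF) that hosts this filename;
# replace it with whatever repo actually provides the file:
# from huggingface_hub import hf_hub_download
# model_path = hf_hub_download(repo_id="TheBloke/Llama-2-7B-GGUF",  # assumed repo
#                              filename="llama-2-7b.Q5_0.gguf")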
file = 'llama-2-7b.Q5_0.gguf'

from llama_cpp import Llama

llm = Llama(
    model_path="./" + file,
    # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    # seed=1337,        # Uncomment to set a specific seed
    # n_ctx=2048,       # Uncomment to increase the context window
)
prompt = "Q: Name the planets in the solar system? A: "
output = llm(
prompt, # Prompt
max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
echo=True # Echo the prompt back in the output
) # Generate a completion, can also call create_completion
print(output)
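
# `output` is an OpenAI-style completion dict; the generated text itself lives
# under choices[0]. A minimal sketch of pulling out just the text, assuming the
# default (non-streaming) llama-cpp-python return format:
# completion_text = output["choices"][0]["text"]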
# NO_GPU = 0
# GPU_LAYERS = 50
# llm = AutoModelForCausalLM.from_pretrained(file, model_type="llama", gpu_layers=NO_GPU)
# # model = AutoModelForCausalLM.from_pretrained("valencar/llamm",
# # model_file=file, model_type="llama", gpu_layers=NO_GPU)
# # access_token = os.getenv('HF_TOKEN2')
# # login(token = access_token)
# prompt = "AI is going to"
with st.container():
    st.write('\n\n')
    st.write(prompt)
    answer = output
    st.write(answer)  # st.write renders the full completion dict
    print(answer)