import os

import streamlit as st
from huggingface_hub import login   # needed for the login() call below
from llama_cpp import Llama

# Load model directly
# from transformers import AutoModel, AutoModelForCausalLM

# Authenticate against the Hugging Face Hub with a token stored in the environment.
access_token = os.getenv('HF_TOKEN3')
login(token=access_token)

file = 'llama-2-7b.Q5_0.gguf'

# Load the quantized GGUF model from the local directory.
llm = Llama(
    model_path="./" + file,
    # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    # seed=1337,        # Uncomment to set a specific seed
    # n_ctx=2048,       # Uncomment to increase the context window
)

prompt = "Q: Name the planets in the solar system? A: "

# Generate a completion; can also call create_completion.
output = llm(
    prompt,              # Prompt
    max_tokens=32,       # Generate up to 32 tokens; set to None to generate up to the end of the context window
    stop=["Q:", "\n"],   # Stop generating just before the model would generate a new question
    echo=True,           # Echo the prompt back in the output
)
print(output)

# Earlier alternative loading path, kept for reference:
# NO_GPU = 0
# GPU_LAYERS = 50
# llm = AutoModelForCausalLM.from_pretrained(file, model_type="llama", gpu_layers=NO_GPU)
# model = AutoModelForCausalLM.from_pretrained("valencar/llamm",
#     model_file=file, model_type="llama", gpu_layers=NO_GPU)
# access_token = os.getenv('HF_TOKEN2')
# login(token=access_token)
# prompt = "AI is going to"

# Display the prompt and the generated text in the Streamlit app.
with st.container():
    st.write('\n\n')
    st.write(prompt)
    answer = output["choices"][0]["text"]  # extract the generated text from the completion dict
    st.write(answer)
    print(answer)
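
# --- Optional interactive variant: a minimal sketch, not part of the original flow. ---
# It reuses the `llm` instance and the Q/A prompt format from above; the widget labels
# and the 64-token limit are illustrative choices, not values taken from the source.
# Note that on each Streamlit rerun the model above is reloaded; wrapping the Llama
# constructor in a function decorated with @st.cache_resource is a common way to avoid
# that (assumes Streamlit >= 1.18).
user_question = st.text_input("Ask the model a question", value="Name the planets in the solar system.")
if st.button("Generate answer") and user_question:
    interactive_output = llm(
        f"Q: {user_question} A: ",   # wrap the question in the same Q/A prompt format used above
        max_tokens=64,               # illustrative limit; raise it for longer answers
        stop=["Q:", "\n"],           # stop before the model starts a new question
    )
    st.write(interactive_output["choices"][0]["text"])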