import os

import streamlit as st
from huggingface_hub import login   # needed for the login() call below
from llama_cpp import Llama

# Load model directly
# from transformers import AutoModel, AutoModelForCausalLM

# Authenticate against the Hugging Face Hub with a token stored in the environment.
access_token = os.getenv('HF_TOKEN3')
login(token=access_token)

file = 'llama-2-7b.Q5_0.gguf'

# Load the quantized GGUF model from the local directory.
llm = Llama(
    model_path="./" + file,
    # n_gpu_layers=-1,  # Uncomment to use GPU acceleration
    # seed=1337,        # Uncomment to set a specific seed
    # n_ctx=2048,       # Uncomment to increase the context window
)

prompt = "Q: Name the planets in the solar system? A: "

# Generate a completion; can also call create_completion.
output = llm(
    prompt,              # Prompt
    max_tokens=32,       # Generate up to 32 tokens; set to None to generate up to the end of the context window
    stop=["Q:", "\n"],   # Stop generating just before the model would generate a new question
    echo=True,           # Echo the prompt back in the output
)
print(output)

# Earlier alternative loading path, kept for reference:
# NO_GPU = 0
# GPU_LAYERS = 50
# llm = AutoModelForCausalLM.from_pretrained(file, model_type="llama", gpu_layers=NO_GPU)
# model = AutoModelForCausalLM.from_pretrained("valencar/llamm",
#     model_file=file, model_type="llama", gpu_layers=NO_GPU)
# access_token = os.getenv('HF_TOKEN2')
# login(token=access_token)
# prompt = "AI is going to"

# Display the prompt and the generated text in the Streamlit app.
with st.container():
    st.write('\n\n')
    st.write(prompt)
    answer = output["choices"][0]["text"]  # extract the generated text from the completion dict
    st.write(answer)
    print(answer)
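
# --- Optional interactive variant: a minimal sketch, not part of the original flow. ---
# It reuses the `llm` instance and the Q/A prompt format from above; the widget labels
# and the 64-token limit are illustrative choices, not values taken from the source.
# Note that on each Streamlit rerun the model above is reloaded; wrapping the Llama
# constructor in a function decorated with @st.cache_resource is a common way to avoid
# that (assumes Streamlit >= 1.18).
user_question = st.text_input("Ask the model a question", value="Name the planets in the solar system.")
if st.button("Generate answer") and user_question:
    interactive_output = llm(
        f"Q: {user_question} A: ",   # wrap the question in the same Q/A prompt format used above
        max_tokens=64,               # illustrative limit; raise it for longer answers
        stop=["Q:", "\n"],           # stop before the model starts a new question
    )
    st.write(interactive_output["choices"][0]["text"])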