import os

import streamlit as st
from huggingface_hub import login
from llama_cpp import Llama

# Alternative loading path (unused): load the model directly via transformers.
# from transformers import AutoModel, AutoModelForCausalLM

# Authenticate with the Hugging Face Hub using a token stored in the
# HF_TOKEN3 environment variable (e.g. a Space secret).
access_token = os.getenv('HF_TOKEN3')
login(token=access_token)

# Quantized GGUF model file, expected to sit next to this script.
file = 'llama-2-7b.Q5_0.gguf'
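# If the GGUF file is not already present locally, it could be fetched from
# the Hub first. This is only a sketch: the repo_id below (TheBloke's GGUF
# conversion of Llama 2 7B) is an assumption, not something this app is known
# to use.
if not os.path.exists(file):
    from huggingface_hub import hf_hub_download
    hf_hub_download(repo_id="TheBloke/Llama-2-7B-GGUF", filename=file,
                    local_dir=".")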

llm = Llama(
      model_path="./" + file,
      # n_gpu_layers=-1, # Uncomment to use GPU acceleration
      # seed=1337, # Uncomment to set a specific seed
      # n_ctx=2048, # Uncomment to increase the context window
)

prompt = "Q: Name the planets in the solar system? A: "
output = llm(
      prompt, # Prompt
      max_tokens=32, # Generate up to 32 tokens, set to None to generate up to the end of the context window
      stop=["Q:", "\n"], # Stop generating just before the model would generate a new question
      echo=True # Echo the prompt back in the output
) # Generate a completion, can also call create_completion
print(output)



# Alternative loading path kept for reference (not used). The keyword
# arguments (model_type, gpu_layers, model_file) match the ctransformers
# AutoModelForCausalLM API rather than transformers.

# NO_GPU = 0
# GPU_LAYERS = 50

# llm = AutoModelForCausalLM.from_pretrained(file, model_type="llama", gpu_layers=NO_GPU)

# # model = AutoModelForCausalLM.from_pretrained("valencar/llamm",
# #                                   model_file=file, model_type="llama", gpu_layers=NO_GPU)

# # access_token = os.getenv('HF_TOKEN2')
# # login(token = access_token)

# prompt = "AI is going to"

# Show the prompt and the generated answer in the Streamlit UI.
with st.container():
    st.write('\n\n')
    st.write(prompt)
    # llama-cpp-python returns an OpenAI-style completion dict; the generated
    # text (which includes the echoed prompt, since echo=True) lives under
    # choices[0]["text"].
    answer = output["choices"][0]["text"]
    st.write(answer)
    print(answer)
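
# To run the app locally (assuming this file is saved as app.py):
#   streamlit run app.py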