radames committed on
Commit c100b91 · 1 Parent(s): 349d9f5

Update app.py

Files changed (1)
  1. app.py +4 -5
app.py CHANGED

@@ -1,10 +1,9 @@
 import gradio as gr
 import copy
 import time
-import ctypes  # to run on C api directly
 import llama_cpp
 from llama_cpp import Llama
-from huggingface_hub import hf_hub_download  # load from huggingfaces
+from huggingface_hub import hf_hub_download


 llm = Llama(
@@ -13,8 +12,8 @@ llm = Llama(
         filename="llama-2-7b-chat.ggmlv3.q5_0.bin",
     ),
     n_ctx=2048,
-    n_gpu_layers=500,
-)  # download model from hf/ n_ctx=2048 for high ccontext length
+    n_gpu_layers=500,  # change n_gpu_layers if you have more or less VRAM
+)

 history = []

@@ -64,7 +63,7 @@ def generate_text(input_text, history):
 demo = gr.ChatInterface(
     generate_text,
     title="llama-cpp-python on GPU",
-    description="Running LLM with https://github.com/abetlen/llama-cpp-python. btw the text streaming thing was the hardest thing to impliment",
+    description="Running LLM with https://github.com/abetlen/llama-cpp-python",
     examples=["Hello", "Am I cool?", "Are tomatoes vegetables?"],
     cache_examples=True,
     retry_btn=None,
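
For context, here is a minimal sketch of the model-loading block as it stands after this commit. The hunk does not show the hf_hub_download repo_id, so TheBloke/Llama-2-7B-Chat-GGML below is an assumption inferred from the filename, not something confirmed by the diff:

# Sketch of the post-commit loading code; repo_id is an assumption,
# since that line is not visible in the hunk.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

llm = Llama(
    model_path=hf_hub_download(
        repo_id="TheBloke/Llama-2-7B-Chat-GGML",  # assumed, elided in the diff
        filename="llama-2-7b-chat.ggmlv3.q5_0.bin",
    ),
    n_ctx=2048,        # context window size
    n_gpu_layers=500,  # change n_gpu_layers if you have more or less VRAM
)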
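The body of generate_text is not part of this diff; only its signature appears in the last hunk header. Since the Space streams tokens into gr.ChatInterface, a plausible sketch of such a handler, assuming llama-cpp-python's stream=True completion interface, might look like:

# Minimal sketch of a streaming chat handler for gr.ChatInterface
# (the actual generate_text body is not shown in this commit).
def generate_text(input_text, history):
    output = ""
    # stream=True yields completion chunks instead of one final dict
    for chunk in llm(input_text, max_tokens=512, stream=True):
        output += chunk["choices"][0]["text"]
        yield output  # ChatInterface renders each partial string as it arrives

Gradio treats a generator function passed to gr.ChatInterface as a streaming handler, re-rendering the chat bubble on every yield, which is what produces the token-by-token effect in the UI.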