Spaces: Running on Zero
Enable speculative decoding
app.py CHANGED
@@ -1,5 +1,6 @@
 import streamlit as st
 from llama_cpp import Llama
+from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
 from huggingface_hub import hf_hub_download
 import os, gc, shutil, re
 from itertools import islice
@@ -139,13 +140,15 @@ def try_load_model(path):
         return Llama(
             model_path=path,
             n_ctx=512,  # Reduced context window to save memory
-            n_threads=
+            n_threads=2,  # Fewer threads for resource-constrained environments
             n_threads_batch=1,
-            n_batch=
+            n_batch=64,  # Lower batch size to conserve memory
             n_gpu_layers=0,
             use_mlock=False,
             use_mmap=True,
             verbose=False,
+            logits_all=True,  # Compute logits for every token, not just the last
+            draft_model=LlamaPromptLookupDecoding(num_pred_tokens=2),  # Draft tokens via prompt n-gram lookup
         )
     except Exception as e:
         return str(e)
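For context: LlamaPromptLookupDecoding drafts candidate tokens by matching n-grams already present in the prompt, so speculative decoding works without loading a second draft model. Below is a minimal standalone sketch of the same configuration, assuming llama-cpp-python is installed; the model path and prompt are placeholders, not taken from this Space.

from llama_cpp import Llama
from llama_cpp.llama_speculative import LlamaPromptLookupDecoding

# Sketch only: "model.gguf" is a placeholder path, not part of this commit.
llm = Llama(
    model_path="model.gguf",
    n_ctx=512,
    logits_all=True,
    draft_model=LlamaPromptLookupDecoding(num_pred_tokens=2),  # small draft size
)

out = llm("Q: What does speculative decoding speed up? A:", max_tokens=64)
print(out["choices"][0]["text"])

Keeping num_pred_tokens small makes each draft cheap to verify, which is why a low value like 2 is generally recommended for CPU-only machines such as this Space's runtime.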