Luigi committed on
Commit a7fdfe6 · 1 Parent(s): 06a162a

Enable speculative decoding

Files changed (1): app.py (+5 -2)
app.py CHANGED
@@ -1,5 +1,6 @@
 import streamlit as st
 from llama_cpp import Llama
+from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
 from huggingface_hub import hf_hub_download
 import os, gc, shutil, re
 from itertools import islice
@@ -139,13 +140,15 @@ def try_load_model(path):
         return Llama(
             model_path=path,
             n_ctx=512,  # Reduced context window to save memory
-            n_threads=1,  # Fewer threads for resource-constrained environments
+            n_threads=2,  # Fewer threads for resource-constrained environments
             n_threads_batch=1,
-            n_batch=2,  # Lower batch size to conserve memory
+            n_batch=64,  # Lower batch size to conserve memory
             n_gpu_layers=0,
             use_mlock=False,
             use_mmap=True,
             verbose=False,
+            logits_all=True,
+            draft_model=LlamaPromptLookupDecoding(num_pred_tokens=2),
         )
     except Exception as e:
         return str(e)
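
For reference, a minimal sketch of what this change enables, assuming a hypothetical local GGUF path (in app.py the file actually comes from hf_hub_download). LlamaPromptLookupDecoding drafts candidate tokens by matching n-grams already present in the prompt, so no separate draft model is loaded; num_pred_tokens=2 mirrors the CPU-oriented setting chosen above.

    from llama_cpp import Llama
    from llama_cpp.llama_speculative import LlamaPromptLookupDecoding

    llm = Llama(
        model_path="model.gguf",  # hypothetical path; app.py downloads the real file
        n_ctx=512,
        n_gpu_layers=0,           # CPU-only, matching the settings above
        logits_all=True,          # mirrors the change above
        draft_model=LlamaPromptLookupDecoding(num_pred_tokens=2),
        verbose=False,
    )

    # Generation calls are unchanged; drafted tokens are verified transparently.
    out = llm("List the days of the week:", max_tokens=48)
    print(out["choices"][0]["text"])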