Change settings again for TPU
app.py
CHANGED
@@ -44,7 +44,7 @@ REPO_ID = "sugiv/leetmonkey-peft-gguf"
 
 # Load the model
 model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_NAME, cache_dir="./models")
-llm = Llama(model_path=model_path, n_ctx=
+llm = Llama(model_path=model_path, n_ctx=1024, n_threads=8, n_gpu_layers=-1, verbose=False, mlock=True) ## TPU
 #llm = Llama(model_path=model_path, n_ctx=1024, n_threads=16, n_gpu_layers=0, verbose=False, mlock=False) ## CPU only
 #llm = Llama(model_path=model_path, n_ctx=1024, n_threads=16, n_gpu_layers=10, verbose=False, mlock=False) ## Nvidia
 logger.info("8-bit model loaded successfully")
@@ -54,7 +54,7 @@ token_to_problem_solution = {}
 
 # Generation parameters
 generation_kwargs = {
-    "max_tokens":
+    "max_tokens": 512,
     "stop": ["```", "### Instruction:", "### Response:"],
     "echo": False,
     "temperature": 0.05,
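For context, here is a minimal sketch of how a model loaded with the new TPU settings would typically be driven with the generation_kwargs above, using the llama-cpp-python API that app.py already relies on. Two things are assumptions, not taken from this diff: the GGUF filename (MODEL_NAME is defined elsewhere in app.py) and the Alpaca-style prompt, which is inferred from the "### Instruction:"/"### Response:" stop sequences.

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

REPO_ID = "sugiv/leetmonkey-peft-gguf"
MODEL_NAME = "leetmonkey-q8_0.gguf"  # hypothetical; the real MODEL_NAME is set outside this diff

# Mirror the commit's loading settings: n_gpu_layers=-1 offloads every layer
# to the accelerator, and mlock=True pins the weights in memory so the OS
# cannot swap them out between requests.
model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_NAME, cache_dir="./models")
llm = Llama(model_path=model_path, n_ctx=1024, n_threads=8, n_gpu_layers=-1, verbose=False, mlock=True)

generation_kwargs = {
    "max_tokens": 512,
    "stop": ["```", "### Instruction:", "### Response:"],
    "echo": False,          # return only the completion, not the prompt
    "temperature": 0.05,    # near-deterministic sampling for code generation
}

# Alpaca-style prompt, inferred from the stop sequences above (an assumption).
prompt = (
    "### Instruction:\nSolve the following LeetCode problem in Python.\n\n"
    "### Response:\n```python\n"
)

# Calling the Llama object runs a completion; it stops after 512 new tokens
# or at the first stop sequence, e.g. the ``` that closes the code block.
output = llm(prompt, **generation_kwargs)
print(output["choices"][0]["text"])

With max_tokens capped at 512 and temperature at 0.05, generations stay short and nearly deterministic, which suits emitting a single, reproducible code block per request.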