sugiv committed
Commit 5818248 · 1 Parent(s): 3de40eb

Enabling share and CORS support
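The commit title refers to share and CORS support, but the visible hunks below only cover logging setup, model-loading flags, and generation parameters. For orientation, a CORS setup in a FastAPI app conventionally looks like the minimal sketch below; the actual configuration in app.py is not shown in this diff, and the wide-open allow_origins=["*"] is purely a placeholder. ("Share" usually refers to exposing a public link, e.g. Gradio's launch(share=True).)

```python
# Assumed sketch only: a typical FastAPI CORS middleware setup of the kind the
# commit title refers to. The real settings in app.py are not visible in this diff.
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],   # placeholder; restrict to trusted origins in production
    allow_methods=["*"],
    allow_headers=["*"],
)
```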

Files changed (1)
  app.py  +6 -3
app.py CHANGED

@@ -15,6 +15,10 @@ from datasets import load_dataset
 from fastapi.responses import StreamingResponse
 import random
 
+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 # Load the dataset (you might want to do this once at the start of your script)
 dataset = load_dataset("sugiv/leetmonkey_python_dataset")
 train_dataset = dataset["train"]
@@ -35,7 +39,7 @@ REPO_ID = "sugiv/leetmonkey-peft-gguf"
 
 # Load the model
 model_path = hf_hub_download(repo_id=REPO_ID, filename=MODEL_NAME, cache_dir="./models")
-llm = Llama(model_path=model_path, n_ctx=1024, n_threads=8, n_gpu_layers=-1)
+llm = Llama(model_path=model_path, n_ctx=1024, n_threads=8, n_gpu_layers=-1, verbose=False, mlock=True)
 logger.info("8-bit model loaded successfully")
 
 # Generation parameters
@@ -46,8 +50,7 @@ generation_kwargs = {
     "temperature": 0.05,
     "top_k": 10,
     "top_p": 0.9,
-    "repeat_penalty": 1.1,
-    "verbose": False
+    "repeat_penalty": 1.1
 }
 
 def verify_token(token: str) -> bool:
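Taken together, the changes set up module-level logging, add verbose=False and mlock=True to the Llama constructor (mlock pins the model in RAM), and drop "verbose" from generation_kwargs, where it is not a sampling parameter; repeat_penalty stays. A minimal sketch of how llm and generation_kwargs are typically wired into a FastAPI streaming endpoint follows; the route and handler names are assumptions, not part of this commit.

```python
# Illustrative sketch (assumed, not part of this commit): streaming tokens from
# the loaded GGUF model through FastAPI. `llm` and `generation_kwargs` are the
# objects defined earlier in app.py; the /generate route is hypothetical.
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

app = FastAPI()

@app.get("/generate")
async def generate(prompt: str):
    def token_stream():
        # With stream=True, llama-cpp-python yields completion chunks incrementally.
        for chunk in llm(prompt, stream=True, **generation_kwargs):
            yield chunk["choices"][0]["text"]
    return StreamingResponse(token_stream(), media_type="text/plain")
```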