Daemontatox commited on
Commit
9bab2dc
·
verified ·
1 Parent(s): a60291d

Update app.py: switch quantization from 4-bit NF4 to 8-bit and lower default temperature from 0.9 to 0.8

Browse files
Files changed (1) hide show
  1. app.py +5 -5
app.py CHANGED
@@ -59,10 +59,10 @@ h3 {
59
  device = "cuda" # for GPU usage or "cpu" for CPU usage
60
 
61
  quantization_config = BitsAndBytesConfig(
62
- load_in_4bit=True,
63
- bnb_4bit_compute_dtype=torch.bfloat16,
64
- bnb_4bit_use_double_quant=True,
65
- bnb_4bit_quant_type="nf4")
66
 
67
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
68
  model = AutoModelForCausalLM.from_pretrained(
@@ -215,7 +215,7 @@ with gr.Blocks(css=CSS, theme="soft") as demo:
215
  minimum=0,
216
  maximum=1,
217
  step=0.1,
218
- value=0.9,
219
  label="Temperature",
220
  render=False,
221
  ),
 
59
  device = "cuda" # for GPU usage or "cpu" for CPU usage
60
 
61
  quantization_config = BitsAndBytesConfig(
62
+ load_in_8bit=True, # Use 8-bit (LLM.int8) instead of 4-bit NF4
63
+ bnb_8bit_compute_dtype=torch.bfloat16, # NOTE: not a valid BitsAndBytesConfig kwarg (only bnb_4bit_compute_dtype exists) — silently ignored as an "unused kwarg"
64
+ bnb_8bit_use_double_quant=False # NOTE: double quantization is 4-bit-only (bnb_4bit_use_double_quant); this kwarg has no effect in 8-bit mode
65
+ )
66
 
67
  tokenizer = AutoTokenizer.from_pretrained(MODEL)
68
  model = AutoModelForCausalLM.from_pretrained(
 
215
  minimum=0,
216
  maximum=1,
217
  step=0.1,
218
+ value=0.8,
219
  label="Temperature",
220
  render=False,
221
  ),