bwilkie committed
Commit a6d2c9e · verified · 1 Parent(s): aa012a4

Update myagent.py

Files changed (1):
  myagent.py  +32 -10
myagent.py CHANGED
@@ -42,13 +42,16 @@ class BasicAgent:
 
 
 
-        # Model configuration
-        model_id = "bartowski/Llama-3.2-3B-Instruct-GGUF"
-        filename = "Llama-3.2-3B-Instruct-Q4_K_M.gguf"
-        torch_dtype = torch.bfloat16
-        # Load tokenizer and model
-        tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
-        model_init = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename, torch_dtype=torch_dtype)
+        # Load model and tokenizer
+        model_id = "LiquidAI/LFM2-1.2B"
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            device_map="auto",
+            torch_dtype="bfloat16",
+            trust_remote_code=True,
+            # attn_implementation="flash_attention_2" <- uncomment on compatible GPU
+        )
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 
         # Create a wrapper class that matches the expected interface
@@ -59,9 +62,28 @@ class LocalLlamaModel:
         self.device = model.device if hasattr(model, 'device') else 'cpu'
 
     def generate(self, prompt: str, max_new_tokens=512*10, **kwargs):
-        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
-        output_ids = model.generate(input_ids, max_new_tokens=max_new_tokens)
-        output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+        # Generate answer
+        prompt = "What is C. elegans?"
+        input_ids = tokenizer.apply_chat_template(
+            [{"role": "user", "content": prompt}],
+            add_generation_prompt=True,
+            return_tensors="pt",
+            tokenize=True,
+        ).to(model.device)
+
+        output = model.generate(
+            input_ids,
+            do_sample=True,
+            temperature=0.3,
+            min_p=0.15,
+            repetition_penalty=1.05,
+            max_new_tokens=512,
+        )
+
+        output = tokenizer.decode(output[0], skip_special_tokens=False)
+
         return output
 
     def __call__(self, prompt: str, max_new_tokens=512, **kwargs):
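
For context, below is a minimal self-contained sketch of the wrapper this commit appears to be building toward. It is a reconstruction under assumptions, not the committed code: it routes the caller's prompt through self.tokenizer/self.model (the diff references module-level tokenizer/model and hardcodes prompt = "What is C. elegans?" inside generate, shadowing the argument), and it assumes a recent transformers release where apply_chat_template and the min_p sampling option are available.

# Hypothetical consolidated wrapper (a sketch, not the committed file).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "LiquidAI/LFM2-1.2B"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="bfloat16",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

class LocalLlamaModel:
    """Wrapper matching the interface BasicAgent expects."""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device if hasattr(model, "device") else "cpu"

    def generate(self, prompt: str, max_new_tokens=512, **kwargs):
        # Format the caller's prompt with the model's chat template.
        input_ids = self.tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            add_generation_prompt=True,
            return_tensors="pt",
            tokenize=True,
        ).to(self.model.device)
        # Sampling settings copied from the commit.
        output_ids = self.model.generate(
            input_ids,
            do_sample=True,
            temperature=0.3,
            min_p=0.15,
            repetition_penalty=1.05,
            max_new_tokens=max_new_tokens,
        )
        # skip_special_tokens=True strips chat-template markers here;
        # the commit keeps them (skip_special_tokens=False).
        return self.tokenizer.decode(output_ids[0], skip_special_tokens=True)

    def __call__(self, prompt: str, max_new_tokens=512, **kwargs):
        return self.generate(prompt, max_new_tokens=max_new_tokens, **kwargs)

Usage would then look like LocalLlamaModel(model, tokenizer)("What is C. elegans?"). Note that generate returns the prompt plus the completion, so the decoded text still echoes the question unless output_ids[0] is sliced to output_ids[0][input_ids.shape[-1]:] before decoding.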