pszemraj committed
Commit 8621629
1 Parent(s): 23625d3

Update README.md

Files changed (1)
  1. README.md +130 -0
README.md CHANGED
@@ -134,3 +134,133 @@ Below is a quick script that can be used as a reference/starting point for writi
<details>
<summary>🔥 Unleash the Power of Code Generation! Click to Reveal the Magic! 🔮</summary>

Are you ready to witness the incredible possibilities of code generation? 🚀 The script below shows how the model generates and completes code.

Use it as a reference/starting point for running code-completion inference with this model.

```python
"""
Simple script for testing model(s) designed to generate/complete code.

See details/args with:
    python textgen_inference_code.py --help
"""
import logging
import random
import time

import fire
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

logging.basicConfig(format="%(levelname)s - %(message)s", level=logging.INFO)


class Timer:
    """
    Basic timer utility.
    """

    def __enter__(self):
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.end_time = time.perf_counter()
        self.elapsed_time = self.end_time - self.start_time
        logging.info(f"Elapsed time: {self.elapsed_time:.4f} seconds")


def load_model(model_name, use_fast=False):
    """Load the model and tokenizer."""
    logging.info(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=use_fast)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype="auto", device_map="auto"
    )
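    # note: torch.compile incurs one-time compilation overhead on the first
    # generate() call, but can speed up repeated generation afterwards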
    model = torch.compile(model)
    return tokenizer, model


def run_inference(prompt, model, tokenizer, max_new_tokens: int = 256):
    """
    Generate a completion for the given prompt.

    Args:
        prompt (str): the (code) prompt to complete
        model: the loaded causal language model
        tokenizer: the tokenizer matching the model
        max_new_tokens (int, optional): max number of new tokens to generate

    Returns:
        str: the decoded output text
    """
    logging.info(f"Running inference with max_new_tokens={max_new_tokens} ...")
    with Timer() as timer:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
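        # beam search decoding with light anti-repetition constraints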
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            min_new_tokens=8,
            renormalize_logits=True,
            no_repeat_ngram_size=8,
            repetition_penalty=1.04,
            num_beams=4,
            early_stopping=True,
        )
    text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    logging.info(f"Output text:\n\n{text}")
    return text


def main(
    model_name="BEE-spoke-data/smol_llama-101M-GQA-python",
    prompt: str = None,
    use_fast=False,
    n_tokens: int = 256,
):
    """
    Load the model and generate a completion for the prompt.

    Args:
        model_name (str, optional): model to load, as a hub ID or local path
        prompt (str, optional): specify the prompt directly (default: random choice from list)
        use_fast (bool, optional): whether to use the fast tokenizer
        n_tokens (int, optional): max new tokens to generate
    """
    logging.info(f"Inference with:\t{model_name}, max_new_tokens:{n_tokens}")

    if prompt is None:
        prompt_list = [
            '''
def print_primes(n: int):
    """
    Print all primes between 1 and n
    """''',
            "def quantum_analysis(",
            "def sanitize_filenames(target_dir:str, recursive:False, extension",
        ]
        prompt = random.SystemRandom().choice(prompt_list)

    logging.info(f"Using prompt:\t{prompt}")

    tokenizer, model = load_model(model_name, use_fast=use_fast)

    run_inference(prompt, model, tokenizer, n_tokens)


if __name__ == "__main__":
    fire.Fire(main)
```
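
Since the script hands `main` to `fire.Fire`, its keyword arguments double as command-line flags (e.g. `--model_name`, `--prompt`, `--n_tokens`). You can also import the helpers directly; a minimal sketch, assuming the script is saved as `textgen_inference_code.py` (the filename used in its docstring) and with a made-up prompt:

```python
# assumes the script above is saved as textgen_inference_code.py
from textgen_inference_code import load_model, run_inference

tokenizer, model = load_model("BEE-spoke-data/smol_llama-101M-GQA-python")
# "def fibonacci(n: int):" is just an illustrative prompt
text = run_inference("def fibonacci(n: int):", model, tokenizer, max_new_tokens=128)
```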

Wowoweewa!! It can create some file-cleaning utilities.

</details>

---