This is a PEFT (LoRA) adapter for unsloth/meta-llama-3.1-8b-instruct-bnb-4bit, trained with GRPO on the openai/gsm8k dataset.
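
For reference, training roughly follows the standard Unsloth + TRL GRPO recipe: load the 4-bit base, attach a LoRA adapter, and optimize it against a reward computed from GSM8K answers. The sketch below is illustrative only; the reward function, hyperparameters, and preprocessing are assumptions, not the exact configuration used for this adapter.

```python
# Illustrative GRPO training sketch (Unsloth + TRL); not the exact recipe used here.
from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)

from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

SYSTEM_PROMPT = (
    "Respond in the following format:\n"
    "<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>"
)

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/meta-llama-3.1-8b-instruct-bnb-4bit",
    max_seq_length=1024,
    load_in_4bit=True,
    fast_inference=True,
    max_lora_rank=32,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

# GSM8K questions become chat prompts; the gold answer is the number after "####".
dataset = load_dataset("openai/gsm8k", "main", split="train")
dataset = dataset.map(lambda x: {
    "prompt": [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": x["question"]},
    ],
    "answer": x["answer"].split("####")[-1].strip(),
})

# Simple correctness reward: 1.0 when the gold answer appears in the completion.
def correctness_reward(prompts, completions, answer, **kwargs):
    responses = [completion[0]["content"] for completion in completions]
    return [1.0 if gold in response else 0.0
            for gold, response in zip(answer, responses)]

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[correctness_reward],
    args=GRPOConfig(
        output_dir="outputs",
        per_device_train_batch_size=8,
        num_generations=8,          # completions sampled per prompt
        max_prompt_length=256,
        max_completion_length=512,
        max_steps=250,
    ),
    train_dataset=dataset,
)
trainer.train()
model.save_pretrained("grpo_lora")  # saves only the LoRA adapter weights
```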

The script below loads the base model with Unsloth, attaches this LoRA adapter with PEFT, and runs an interactive chat loop that generates with vLLM:

```python
import time
from unsloth import FastLanguageModel, PatchFastRL
PatchFastRL("GRPO", FastLanguageModel)
import torch

# Define model parameters
max_seq_length = 13000  # Increase for longer contexts if needed
lora_rank = 32        # Larger rank = potentially better performance but slower inference

# Define system prompt and response format
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# Load the base model in 4-bit with vLLM fast inference enabled
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # Quantize the base weights to 4-bit to save memory
    fast_inference=True,  # Enable vLLM fast inference
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.6,  # Adjust if you run into memory issues
)

# Load the LoRA weights using PEFT
from peft import PeftModel
model = PeftModel.from_pretrained(model, "miike-ai/Llama-3.1-8b-gsm8k-r")

# Import sampling parameters from vLLM
from vllm import SamplingParams

print("Model fully loaded into memory. Ready to chat!")
print("Type 'exit' or 'quit' to stop.\n")

# Interactive chat loop
while True:
    try:
        user_input = input("User: ")
    except KeyboardInterrupt:
        print("\nExiting...")
        break

    # Exit the chat loop if the user types 'exit' or 'quit'
    if user_input.strip().lower() in {"exit", "quit"}:
        print("Exiting...")
        break

    # Prepare the prompt with the system prompt and user input
    prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_input},
        ],
        tokenize=False,
        add_generation_prompt=True
    )

    # Set sampling parameters for generation
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=1024
    )

    # Generate the response and print it with a simulated streaming effect
    start_time = time.time()
    print("\nAssistant: ", end="", flush=True)
    
    # Generate the full response with vLLM
    outputs = model.fast_generate(prompt, sampling_params=sampling_params)
    response_text = outputs[0].outputs[0].text
    
    # Print the response character by character to simulate streaming
    for char in response_text:
        print(char, end="", flush=True)
        time.sleep(0.02)  # Small delay for readability
    
    inference_time = time.time() - start_time
    print(f"\nInference time: {inference_time:.2f} seconds\n")