This PEFT adapter was trained on top of unsloth/meta-llama-3.1-8b-instruct-bnb-4bit using GRPO on the openai/gsm8k dataset.
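For reference, the sketch below shows what a GRPO training setup of this kind typically looks like with Unsloth's `PatchFastRL` and TRL's `GRPOTrainer`. The LoRA settings, hyperparameters, and the correctness-based reward function are illustrative assumptions, not the exact recipe used to produce this checkpoint.

```python
# Hypothetical GRPO training sketch (Unsloth + TRL); all values are assumptions.
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
from unsloth import FastLanguageModel, PatchFastRL

PatchFastRL("GRPO", FastLanguageModel)

# Load the 4-bit base model and attach fresh LoRA adapters
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/meta-llama-3.1-8b-instruct-bnb-4bit",
    max_seq_length=1024,
    load_in_4bit=True,
    fast_inference=True,         # generate rollouts with vLLM during training
    max_lora_rank=32,
    gpu_memory_utilization=0.6,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

SYSTEM_PROMPT = (
    "Respond in the following format:\n"
    "<reasoning>\n...\n</reasoning>\n<answer>\n...\n</answer>"
)

def extract_answer(text: str) -> str:
    # Pull the content of the <answer> ... </answer> block.
    return text.split("<answer>")[-1].split("</answer>")[0].strip()

# GSM8K: prompts become chat messages; gold answers follow the "#### <number>" marker
dataset = load_dataset("openai/gsm8k", "main", split="train").map(
    lambda x: {
        "prompt": [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": x["question"]},
        ],
        "answer": x["answer"].split("####")[-1].strip(),
    }
)

def correctness_reward(prompts, completions, answer, **kwargs):
    # With conversational prompts, each completion is a list of chat messages.
    responses = [c[0]["content"] for c in completions]
    # 2.0 when the extracted answer matches the gold answer, else 0.0 (assumed shaping)
    return [2.0 if extract_answer(r) == a else 0.0 for r, a in zip(responses, answer)]

training_args = GRPOConfig(
    output_dir="outputs",
    learning_rate=5e-6,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    num_generations=8,           # completions sampled per prompt for the group baseline
    max_prompt_length=256,
    max_completion_length=512,
    max_steps=250,
    use_vllm=True,
)

trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[correctness_reward],
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
```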
To use the adapter, the script below loads the 4-bit base model with Unsloth, attaches this LoRA adapter with PEFT, and runs an interactive chat loop using vLLM fast inference:

```python
import time

from unsloth import FastLanguageModel, PatchFastRL

# Patch Unsloth for GRPO-trained models before loading
PatchFastRL("GRPO", FastLanguageModel)

# Model parameters
max_seq_length = 13000  # Increase for longer contexts if needed
lora_rank = 32          # Larger rank = potentially better quality but slower inference

# System prompt and response format
SYSTEM_PROMPT = """
Respond in the following format:
<reasoning>
...
</reasoning>
<answer>
...
</answer>
"""

# Load the base model in 4-bit mode with vLLM fast inference enabled
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="meta-llama/Llama-3.1-8B-Instruct",
    max_seq_length=max_seq_length,
    load_in_4bit=True,          # 4-bit quantized weights (bitsandbytes)
    fast_inference=True,        # Enable vLLM fast inference
    max_lora_rank=lora_rank,
    gpu_memory_utilization=0.6, # Lower this if you run into memory issues
)

# Attach the LoRA adapter weights with PEFT
from peft import PeftModel
model = PeftModel.from_pretrained(model, "miike-ai/Llama-3.1-8b-gsm8k-r")

# Sampling parameters for vLLM generation
from vllm import SamplingParams

print("Model fully loaded into memory. Ready to chat!")
print("Type 'exit' or 'quit' to stop.\n")

# Interactive chat loop
while True:
    try:
        user_input = input("User: ")
    except KeyboardInterrupt:
        print("\nExiting...")
        break

    # Exit the chat loop if the user types 'exit' or 'quit'
    if user_input.strip().lower() in {"exit", "quit"}:
        print("Exiting...")
        break

    # Build the prompt from the system prompt and user input
    prompt = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_input},
        ],
        tokenize=False,
        add_generation_prompt=True,
    )

    # Sampling parameters for generation
    sampling_params = SamplingParams(
        temperature=0.8,
        top_p=0.95,
        max_tokens=1024,
    )

    # Generate the response and measure inference time (excluding display delay)
    print("\nAssistant: ", end="", flush=True)
    start_time = time.time()
    outputs = model.fast_generate(prompt, sampling_params=sampling_params)
    response_text = outputs[0].outputs[0].text
    inference_time = time.time() - start_time

    # Print the response character by character to simulate streaming
    for ch in response_text:
        print(ch, end="", flush=True)
        time.sleep(0.02)  # Small delay for readability

    print(f"\nInference time: {inference_time:.2f} seconds\n")
```
Model tree for miike-ai/Llama-3.1-8b-gsm8k-r:
- Base model: meta-llama/Llama-3.1-8B
- Fine-tuned: meta-llama/Llama-3.1-8B-Instruct
- This adapter: miike-ai/Llama-3.1-8b-gsm8k-r