import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from accelerate import init_empty_weights, load_checkpoint_and_dispatch

MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1"  # Hugging Face Hub id (or local path) of the Mixtral 8x7B base model
SAFETENSORS_PATH = "path_to_your_model.safetensors"  # Local safetensors checkpoint
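
# Load the tokenizer for the model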
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Build the model skeleton on the "meta" device so no real weights are allocated yet.
# Calling from_pretrained() here would download and load the full weights, so
# instantiate the architecture from its config instead.
with init_empty_weights():
    config = AutoConfig.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_config(config)

# Load the checkpoint and dispatch layers across the available devices.
# load_checkpoint_and_dispatch reads the safetensors file directly, so a
# separate load_file() call (and the extra copy in CPU RAM) is not needed.
model = load_checkpoint_and_dispatch(
    model,
    SAFETENSORS_PATH,
    device_map="auto",  # Automatically handles GPU/CPU offloading
    no_split_module_classes=["MixtralDecoderLayer"],  # Keep each decoder layer on a single device
    dtype=torch.float16,  # Use half precision for memory efficiency
)

# device_map="auto" has already placed the model (possibly across several devices),
# so calling model.to(device) is unnecessary and can conflict with CPU offloading;
# only the inputs need to be moved.
device = "cuda" if torch.cuda.is_available() else "cpu"
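
# Tokenize the prompt and move the tensors to the chosen device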
input_text = "Hello, how are you?"
inputs = tokenizer(input_text, return_tensors="pt").to(device)

# Sampling parameters (temperature, top_k, top_p) only take effect with do_sample=True
with torch.no_grad():
    outputs = model.generate(
        **inputs,  # Pass input_ids together with the attention mask
        max_length=50,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )
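
# Decode and print the generated sequence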
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:", generated_text)