ZeppelinCorp
/

Charm_15

Text Generation

Mixture of Experts

text-generation-inference

673_trillion_parameters

Model card Files Files and versions Community

Charm_15 / base_model.safetensors

GeminiFan207's picture

Update base_model.safetensors

10bfb12 verified 25 days ago

1.67 kB

	import torch
	from accelerate import init_empty_weights, load_checkpoint_and_dispatch
	from safetensors.torch import load_file
	from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

	# Load a causal language model from a single safetensors checkpoint using
	# `accelerate` big-model inference, then sample a short completion.

	# MODEL_NAME supplies the config and tokenizer (Hub repo id or local
	# directory); SAFETENSORS_PATH is the checkpoint the weights are read from.
	MODEL_NAME = "mistral-8x7B"
	SAFETENSORS_PATH = "path_to_your_model.safetensors"

	# Load the tokenizer
	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

	# Build the architecture from its config only. `from_config` never reads a
	# checkpoint, so inside `init_empty_weights()` every parameter is created on
	# the meta device and no memory is allocated for weights yet.
	config = AutoConfig.from_pretrained(MODEL_NAME)
	with init_empty_weights():
	    model = AutoModelForCausalLM.from_config(config)

	# Stream the checkpoint into the empty model and place each module on
	# GPU / CPU / disk as memory allows. The checkpoint is deliberately NOT
	# pre-loaded with `load_file`: that would materialize every tensor in CPU
	# RAM first and defeat the purpose of the empty-weights initialization.
	model = load_checkpoint_and_dispatch(
	    model,
	    SAFETENSORS_PATH,
	    device_map="auto",  # Automatically handles GPU/CPU offloading
	    # Ask the model which blocks must stay on one device (blocks with
	    # residual connections) instead of hard-coding a class name.
	    no_split_module_classes=getattr(model, "_no_split_modules", None),
	    dtype=torch.float16,  # Load weights in half precision to save memory
	)
	# `from_config` leaves the model in training mode; switch to inference mode.
	model.eval()

	# The dispatched model must not be moved with `model.to(...)`: that would
	# undo the device map (and fails when modules are offloaded). Only the
	# inputs are placed on the device that hosts the first layers.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Example usage
	input_text = "Hello, how are you?"
	inputs = tokenizer(input_text, return_tensors="pt").to(device)

	# Generate output without tracking gradients
	with torch.no_grad():
	    outputs = model.generate(
	        inputs["input_ids"],
	        attention_mask=inputs["attention_mask"],
	        max_length=50,
	        num_return_sequences=1,
	        # Sampling must be enabled explicitly; otherwise temperature,
	        # top_k and top_p are ignored and decoding is greedy.
	        do_sample=True,
	        temperature=0.7,
	        top_k=50,
	        top_p=0.95,
	    )

	# Decode and print the output
	generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
	print("Generated Text:", generated_text)