import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import spaces

# Model and tokenizer are loaded once at startup
tokenizer = None
model = None

def load_model():
    global tokenizer, model
    tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16")
    model = AutoModelForCausalLM.from_pretrained(
        "ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16",
        torch_dtype=torch.float16,
        device_map="auto",  # places the model on available devices; no explicit .to('cuda') needed
    )

load_model()

# Define a function for generating text from a prompt
@spaces.GPU  # request GPU time for this call on ZeroGPU hardware
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # tokenize input and move it to the model's device
    outputs = model.generate(**inputs, max_new_tokens=100)  # generate up to 100 new tokens (max_length would count the prompt too)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)  # decode and return the text

# Create Gradio Interface
interface = gr.Interface(
    fn=generate_text,  # function that handles text generation
    inputs="text",     # input is a text box
    outputs="text",    # output is a text box
    title="Meta-Llama-3.1-70B Text Generation",
    description="Enter a prompt and generate text using Meta-Llama-3.1-70B.",
)

# Launch the Gradio app
interface.launch()
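
Because the app loads an AQLM-quantized checkpoint, the Space's environment needs the AQLM inference kernels alongside the usual stack. A minimal requirements.txt sketch might look like the following; the version pin is an assumption (AQLM support landed in transformers around 4.38), not taken from the original Space:

    # requirements.txt (assumed dependencies, not from the original Space)
    gradio
    torch
    transformers>=4.38.0  # assumed minimum for AQLM-quantized model support
    accelerate            # required for device_map="auto"
    aqlm[gpu]             # inference kernels for AQLM-quantized checkpoints

The `spaces` package itself is preinstalled on ZeroGPU hardware, so it does not normally need to be listed.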