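# Hugging Face Space app: a Gradio demo that generates text with the 2-bit
# AQLM-quantized Meta-Llama-3.1-70B checkpoint
# (ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16).
#
# The commented-out block below is an earlier revision that loaded the model
# lazily inside a @spaces.GPU-decorated handler; it is kept for reference.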
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch
# import spaces
#
# tokenizer = None
# model = None
#
# def loadmodel():
#     tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16")
#     model = AutoModelForCausalLM.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16", torch_dtype='auto', device_map='auto')
#     return tokenizer, model
#
# @spaces.GPU
# def generate_text(prompt):
#     global tokenizer, model
#     if tokenizer is None or model is None:
#         tokenizer, model = loadmodel()
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     outputs = model.generate(inputs.input_ids, max_length=100)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)
#
# interface = gr.Interface(
#     fn=generate_text,
#     inputs="text",
#     outputs="text",
#     title="Meta-Llama-3.1-70B Text Generation",
#     description="Enter a prompt and generate text using Meta-Llama-3.1-70B.",
# )
# interface.launch()

import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import subprocess
import os
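
# Note: the AQLM 2-bit inference kernels are compiled at run time, which needs a
# full CUDA toolkit (nvcc); the Space's base image presumably does not ship one,
# so the helper below installs CUDA 12.2 into /usr/local/cuda before the model
# is loaded.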
def install_cuda_toolkit():
    # CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"
    CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
    CUDA_TOOLKIT_FILE = "/tmp/%s" % os.path.basename(CUDA_TOOLKIT_URL)
    # Download the installer and run it in silent, toolkit-only mode.
    subprocess.call(["wget", "-q", CUDA_TOOLKIT_URL, "-O", CUDA_TOOLKIT_FILE])
    subprocess.call(["chmod", "+x", CUDA_TOOLKIT_FILE])
    subprocess.call([CUDA_TOOLKIT_FILE, "--silent", "--toolkit"])

    # Point the build environment at the freshly installed toolkit.
    os.environ["CUDA_HOME"] = "/usr/local/cuda"
    os.environ["PATH"] = "%s/bin:%s" % (os.environ["CUDA_HOME"], os.environ["PATH"])
    os.environ["LD_LIBRARY_PATH"] = "%s/lib:%s" % (
        os.environ["CUDA_HOME"],
        "" if "LD_LIBRARY_PATH" not in os.environ else os.environ["LD_LIBRARY_PATH"],
    )
    # Fix: arch_list[-1] += '+PTX'; IndexError: list index out of range
    os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"

install_cuda_toolkit()
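
# Load the tokenizer and the 2-bit AQLM checkpoint once at startup;
# device_map='auto' lets Accelerate decide where the weights are placed.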
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16")
model = AutoModelForCausalLM.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16", torch_dtype='auto', device_map='auto')

@spaces.GPU  # request a GPU for this call on ZeroGPU hardware (as in the earlier revision above)
def generate_text(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Pass the attention mask along with the input ids to avoid generation warnings.
    outputs = model.generate(**inputs, max_length=100)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

interface = gr.Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
    title="Meta-Llama-3.1-70B Text Generation",
    description="Enter a prompt and generate text using Meta-Llama-3.1-70B.",
)
interface.launch()