# import gradio as gr # from transformers import AutoTokenizer, AutoModelForCausalLM # import torch # import spaces # tokenizer = None # model = None # def loadmodel(): # tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16") # model = AutoModelForCausalLM.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16", torch_dtype='auto', device_map='auto') # return tokenizer, model # @spaces.GPU # def generate_text(prompt): # global tokenizer, model # if tokenizer is None or model is None: # tokenizer, model = loadmodel() # inputs = tokenizer(prompt, return_tensors="pt").to(model.device) # outputs = model.generate(inputs.input_ids, max_length=100) # return tokenizer.decode(outputs[0], skip_special_tokens=True) # interface = gr.Interface( # fn=generate_text, # inputs="text", # outputs="text", # title="Meta-Llama-3.1-70B Text Generation", # description="Enter a prompt and generate text using Meta-Llama-3.1-70B.", # ) # interface.launch() import spaces import gradio as gr from transformers import AutoTokenizer, AutoModelForCausalLM import torch import subprocess import os def install_cuda_toolkit(): # CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run" CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run" CUDA_TOOLKIT_FILE = "/tmp/%s" % os.path.basename(CUDA_TOOLKIT_URL) subprocess.call(["wget", "-q", CUDA_TOOLKIT_URL, "-O", CUDA_TOOLKIT_FILE]) subprocess.call(["chmod", "+x", CUDA_TOOLKIT_FILE]) subprocess.call([CUDA_TOOLKIT_FILE, "--silent", "--toolkit"]) os.environ["CUDA_HOME"] = "/usr/local/cuda" os.environ["PATH"] = "%s/bin:%s" % (os.environ["CUDA_HOME"], os.environ["PATH"]) os.environ["LD_LIBRARY_PATH"] = "%s/lib:%s" % ( os.environ["CUDA_HOME"], "" if "LD_LIBRARY_PATH" not in os.environ else os.environ["LD_LIBRARY_PATH"], ) # Fix: arch_list[-1] += '+PTX'; IndexError: list index out of range os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6" install_cuda_toolkit() device = "cuda" if torch.cuda.is_available() else "cpu" tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16") model = AutoModelForCausalLM.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16", torch_dtype='auto', device_map='auto').to(device) @spaces.GPU def generate_text(prompt): inputs = tokenizer(prompt, return_tensors="pt").to(model.device) outputs = model.generate(inputs.input_ids, max_length=100) return tokenizer.decode(outputs[0], skip_special_tokens=True) interface = gr.Interface( fn=generate_text, inputs="text", outputs="text", title="Meta-Llama-3.1-70B Text Generation", description="Enter a prompt and generate text using Meta-Llama-3.1-70B.", ) interface.launch()