# Previous version (kept for reference): loaded the model lazily inside the GPU-decorated handler.
# import gradio as gr
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch
# import spaces

# tokenizer = None
# model = None

# def loadmodel():
#     tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16")
#     model = AutoModelForCausalLM.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16", torch_dtype='auto', device_map='auto')
#     return tokenizer, model

# @spaces.GPU
# def generate_text(prompt):
#     global tokenizer, model
#     if tokenizer is None or model is None:
#         tokenizer, model = loadmodel()
    
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     outputs = model.generate(inputs.input_ids, max_length=100)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

# interface = gr.Interface(
#     fn=generate_text,
#     inputs="text",
#     outputs="text",
#     title="Meta-Llama-3.1-70B Text Generation",
#     description="Enter a prompt and generate text using Meta-Llama-3.1-70B.",
# )

# interface.launch()

import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import subprocess
import os
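
# Active app: install the CUDA toolkit at start-up (the AQLM 2-bit kernels
# appear to need nvcc and the CUDA libraries to build), load the quantized
# Meta-Llama-3.1-70B checkpoint once, and serve it through a Gradio interface.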

def install_cuda_toolkit():
    # Download and silently install the CUDA toolkit so nvcc and the CUDA
    # libraries are available when the quantized-inference kernels are built.
    # CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run"
    CUDA_TOOLKIT_URL = "https://developer.download.nvidia.com/compute/cuda/12.2.0/local_installers/cuda_12.2.0_535.54.03_linux.run"
    CUDA_TOOLKIT_FILE = "/tmp/%s" % os.path.basename(CUDA_TOOLKIT_URL)
    subprocess.call(["wget", "-q", CUDA_TOOLKIT_URL, "-O", CUDA_TOOLKIT_FILE])
    subprocess.call(["chmod", "+x", CUDA_TOOLKIT_FILE])
    subprocess.call([CUDA_TOOLKIT_FILE, "--silent", "--toolkit"])

    os.environ["CUDA_HOME"] = "/usr/local/cuda"
    os.environ["PATH"] = "%s/bin:%s" % (os.environ["CUDA_HOME"], os.environ["PATH"])
    os.environ["LD_LIBRARY_PATH"] = "%s/lib:%s" % (
        os.environ["CUDA_HOME"],
        "" if "LD_LIBRARY_PATH" not in os.environ else os.environ["LD_LIBRARY_PATH"],
    )
    # Fix: arch_list[-1] += '+PTX'; IndexError: list index out of range
    os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0;8.6"

# Run the toolkit installation at import time, before the model is loaded.
install_cuda_toolkit()

device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16")
model = AutoModelForCausalLM.from_pretrained("ISTA-DASLab/Meta-Llama-3.1-70B-AQLM-PV-2Bit-1x16", torch_dtype='auto', device_map='auto').to(device)

@spaces.GPU
def generate_text(prompt):
    # Run generation on the GPU allocated by Spaces; cap output at 100 total tokens.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=100,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
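
# Quick sanity check (hypothetical prompt; only works once the model has loaded):
# print(generate_text("Briefly explain what AQLM quantization is."))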

interface = gr.Interface(
    fn=generate_text,
    inputs="text",
    outputs="text",
    title="Meta-Llama-3.1-70B Text Generation",
    description="Enter a prompt and generate text using Meta-Llama-3.1-70B.",
)

interface.launch()