import os

import spaces
import torch
import gradio as gr
from huggingface_hub import snapshot_download, login
from transformers.utils import move_cache
LLM_MODEL_DIR = '/model'
LLM_MODEL_ID = "mistral-community/Mistral-7B-v0.2"
LLM_MODEL_REVISION = 'main'

# Download the weights once at startup (still on CPU) and migrate any
# old-style transformers cache to the current layout.
os.makedirs(LLM_MODEL_DIR, exist_ok=True)
snapshot_download(LLM_MODEL_ID, revision=LLM_MODEL_REVISION, local_dir=LLM_MODEL_DIR)  # , token=HF_TOKEN)
move_cache()
# cpu
zero = torch.Tensor([0]).cuda()
print(zero.device)  # <-- 'cpu' 🤔
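# On ZeroGPU hardware no GPU is attached at import time: the `spaces`
# package defers real CUDA initialisation, which is why the tensor above
# still reports 'cpu' despite the .cuda() call. A GPU is attached only
# while a @spaces.GPU-decorated function is executing, where the same
# tensor reports 'cuda:0'.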
# gpu
@spaces.GPU
def greet(user):
    # print(zero.device)  # <-- 'cuda:0' 🤗
    # Import vLLM lazily so CUDA is only initialised inside the GPU context.
    from vllm import LLM, SamplingParams

    # Note: the model is (re)loaded on every request; acceptable for a demo,
    # but a long-lived Space would want to cache the LLM instance.
    model = LLM(LLM_MODEL_DIR)
    sampling_params = SamplingParams(
        temperature=0.3,
        ignore_eos=False,
        max_tokens=1024,
    )
    prompts = [user]
    model_outputs = model.generate(prompts, sampling_params)
    generations = []
    for output in model_outputs:
        for completion in output.outputs:
            generations.append(completion.text)
    return generations[0]
# The prompt is free-form text, so the input should be a Textbox, not a Number.
demo = gr.Interface(fn=greet, inputs=gr.Textbox(), outputs=gr.Text())
demo.launch()
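Once the Space is up, the same endpoint can be called programmatically. A minimal client-side sketch, assuming the Space is published under the hypothetical id "your-username/your-space" (gradio_client targets gr.Interface's default "/predict" endpoint):

from gradio_client import Client

client = Client("your-username/your-space")  # hypothetical Space id
result = client.predict("Write a haiku about GPUs.", api_name="/predict")
print(result)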