import os

import gradio as gr
import spaces
import torch
from huggingface_hub import snapshot_download
from transformers.utils import move_cache

LLM_MODEL_DIR = "/model"
LLM_MODEL_ID = "mistral-community/Mistral-7B-v0.2"
LLM_MODEL_REVISION = "main"

# Download the model weights once at startup, before any GPU work.
os.makedirs(LLM_MODEL_DIR, exist_ok=True)
snapshot_download(LLM_MODEL_ID, revision=LLM_MODEL_REVISION, local_dir=LLM_MODEL_DIR)  # , token=HF_TOKEN)
move_cache()

# cpu: outside a @spaces.GPU function, ZeroGPU has not attached a GPU yet,
# so even a .cuda() tensor still reports the CPU device.
zero = torch.Tensor([0]).cuda()
print(zero.device)  # <-- 'cpu' 🤔


# gpu: inside the decorated function a GPU is attached on demand.
@spaces.GPU
def greet(user):
    # print(zero.device)  # <-- 'cuda:0' 🤗
    # Import vLLM lazily so CUDA is only initialized once the GPU is available.
    from vllm import LLM, SamplingParams

    model = LLM(LLM_MODEL_DIR)
    sampling_params = SamplingParams(
        temperature=0.3,
        ignore_eos=False,
        max_tokens=1024,
    )
    model_outputs = model.generate([user], sampling_params)

    # Collect the generated text from each completion.
    generations = []
    for output in model_outputs:
        for completion in output.outputs:
            generations.append(completion.text)
    return generations[0]


# The prompt is free-form text, so use a text input;
# gr.Number() would pass a float to the model.
demo = gr.Interface(fn=greet, inputs=gr.Text(), outputs=gr.Text())
demo.launch()