import gradio as gr
import huggingface_hub
import os
import spaces
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Qwen2_5_VLForConditionalGeneration
from datasets import load_dataset  # imported but not used in this script
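# Log in to the Hugging Face Hub with the HF_TOKEN secret configured for this Space.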
huggingface_hub.login(os.getenv('HF_TOKEN'))
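# PEFT/LoRA adapter to load; alternative adapters are kept commented out below for reference.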
peft_model_id = "debisoft/DeepSeek-R1-Distill-Qwen-7B-thinking-function_calling-quant-V0"
#peft_model_id = "debisoft/Qwen2.5-VL-7B-Instruct-thinking-function_calling-quant-V0"
#peft_model_id = "debisoft/Qwen2.5-VL-3B-Instruct-thinking-function_calling-V0"
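# 4-bit NF4 quantization config so the 7B base model fits in the Space's GPU memory.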
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
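# Device handles used below (note: `device` is unused; placement is handled by device_map).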
device = "auto"
cuda_device = torch.device("cuda")
cpu_device = torch.device("cpu")
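# Read the adapter config to locate the base model, then load that base model in 4-bit.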
config = PeftConfig.from_pretrained(peft_model_id)
#model = Qwen2_5_VLForConditionalGeneration.from_pretrained(config.base_model_name_or_path,
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
)
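# Use the adapter's tokenizer and resize the embedding matrix to its vocabulary size
# (the fine-tune presumably added special/function-calling tokens).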
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
model.resize_token_embeddings(len(tokenizer))
#tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
#model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
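# On ZeroGPU Spaces, @spaces.GPU allocates a GPU for the duration of this call.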
@spaces.GPU
def sentience_check():
    # Attach the LoRA adapter to the quantized base model on the GPU.
    peft_model = PeftModel.from_pretrained(
        model, peft_model_id, device_map="cuda",
        #offload_folder="offload/",
    )
    #peft_model.to(torch.bfloat16)
    peft_model.eval()
    #peft_model.to(cuda_device)
    inputs = tokenizer("Are you sentient?", return_tensors="pt").to(cuda_device)
    with torch.no_grad():
        outputs = peft_model.generate(
            **inputs, max_new_tokens=3000, pad_token_id=tokenizer.eos_token_id
        )
    #peft_model.to(cpu_device)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
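# Minimal Gradio UI: no inputs, a single button runs the check and shows the decoded text.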
demo = gr.Interface(fn=sentience_check, inputs=None, outputs=gr.Text())
demo.launch()