import gradio as gr
import huggingface_hub
import os
import spaces
import torch

from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Qwen2_5_VLForConditionalGeneration

# Authenticate so gated/private checkpoints on the Hub can be pulled
huggingface_hub.login(os.getenv('HF_TOKEN'))

# LoRA adapter to load; the commented IDs are alternative fine-tunes
peft_model_id = "debisoft/DeepSeek-R1-Distill-Qwen-7B-thinking-function_calling-quant-V0"
#peft_model_id = "debisoft/Qwen2.5-VL-7B-Instruct-thinking-function_calling-quant-V0"
#peft_model_id = "debisoft/Qwen2.5-VL-3B-Instruct-thinking-function_calling-V0"

# Quantize the base model to 4-bit NF4 so it fits on a single GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

device = "auto"
cuda_device = torch.device("cuda")
cpu_device = torch.device("cpu")

config = PeftConfig.from_pretrained(peft_model_id)
# For the Qwen2.5-VL adapters above, load the base with Qwen2_5_VLForConditionalGeneration instead
#model = Qwen2_5_VLForConditionalGeneration.from_pretrained(config.base_model_name_or_path,
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
# The fine-tune added tokens; grow the embedding matrix to match the tokenizer
model.resize_token_embeddings(len(tokenizer))

# Un-adapted baseline for comparison (bypasses the LoRA adapter entirely):
#tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
#model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")


@spaces.GPU
def sentience_check():
    # Attach the LoRA adapter weights on top of the quantized base model
    peft_model = PeftModel.from_pretrained(model, peft_model_id, device_map="cuda")
    peft_model.eval()

    inputs = tokenizer("Are you sentient?", return_tensors="pt").to(cuda_device)

    with torch.no_grad():
        outputs = peft_model.generate(
            **inputs, max_new_tokens=3000, pad_token_id=tokenizer.eos_token_id
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Single-button UI: no inputs, one text output containing the model's reply
demo = gr.Interface(fn=sentience_check, inputs=None, outputs=gr.Text())
demo.launch()
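
# A minimal sketch of calling this app programmatically once the Space is
# running, using the gradio_client package. The Space path below is a
# placeholder assumption, not taken from this file; api_name="/predict" is
# the gr.Interface default endpoint name.
#
#   from gradio_client import Client
#
#   client = Client("debisoft/<space-name>")  # hypothetical Space path
#   result = client.predict(api_name="/predict")  # fn takes no inputs
#   print(result)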