deepseek-coder-33b-instruct

Runtime error

App Files Files Community

Starchik committed on Jan 29

Commit

db66a89

verified ·

1 Parent(s): 69ab3c6

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -25

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import os
 from threading import Thread
 from typing import Iterator
@@ -10,27 +9,30 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStream
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
-total_count=0
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 DESCRIPTION = """\
 # DeepSeek-33B-Chat
 This space demonstrates model [DeepSeek-Coder](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct) by DeepSeek, a code model with 33B parameters fine-tuned for chat instructions.
 **You can also try our 33B model in [official homepage](https://coder.deepseek.com/chat).**
 """
-if not torch.cuda.is_available():
-    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
-if torch.cuda.is_available():
-    model_id = "deepseek-ai/deepseek-coder-33b-instruct"
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.use_default_system_prompt = False
 @spaces.GPU
@@ -46,8 +48,11 @@ def generate(
 ) -> Iterator[str]:
     global total_count
     total_count += 1
-    print(total_count)
-    os.system("nvidia-smi")
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
@@ -56,31 +61,33 @@ def generate(
     conversation.append({"role": "user", "content": message})
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(model.device)
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        {"input_ids": input_ids},
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=False,
         top_p=top_p,
         top_k=top_k,
         num_beams=1,
-        # temperature=temperature,
         repetition_penalty=repetition_penalty,
         eos_token_id=32021
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
     outputs = []
     for text in streamer:
         outputs.append(text)
-        yield "".join(outputs).replace("<|EOT|>","")
 chat_interface = gr.ChatInterface(
@@ -94,13 +101,6 @@ chat_interface = gr.ChatInterface(
             step=1,
             value=DEFAULT_MAX_NEW_TOKENS,
         ),
-        # gr.Slider(
-        #     label="Temperature",
-        #     minimum=0,
-        #     maximum=4.0,
-        #     step=0.1,
-        #     value=0,
-        # ),
         gr.Slider(
             label="Top-p (nucleus sampling)",
             minimum=0.05,

 import os
 from threading import Thread
 from typing import Iterator
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
+total_count = 0
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 DESCRIPTION = """\
 # DeepSeek-33B-Chat
 This space demonstrates model [DeepSeek-Coder](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct) by DeepSeek, a code model with 33B parameters fine-tuned for chat instructions.
 **You can also try our 33B model in [official homepage](https://coder.deepseek.com/chat).**
 """
+# Check whether a GPU is available
+use_cuda = torch.cuda.is_available()
+if not use_cuda:
+    DESCRIPTION += "\n<p>Running on CPU 🥶 Performance may be significantly slower.</p>"
+# Select the device
+device = torch.device("cuda" if use_cuda else "cpu")
+torch_dtype = torch.bfloat16 if use_cuda else torch.float32
+# Load the model and tokenizer
+model_id = "deepseek-ai/deepseek-coder-33b-instruct"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch_dtype, device_map="auto" if use_cuda else None)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+tokenizer.use_default_system_prompt = False
 @spaces.GPU
 ) -> Iterator[str]:
     global total_count
     total_count += 1
+    print(f"Request number: {total_count}")
+    if use_cuda:
+        os.system("nvidia-smi")
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
     conversation.append({"role": "user", "content": message})
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    input_ids = input_ids.to(device)
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
+        input_ids=input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=False,
         top_p=top_p,
         top_k=top_k,
         num_beams=1,
         repetition_penalty=repetition_penalty,
         eos_token_id=32021
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
     outputs = []
     for text in streamer:
         outputs.append(text)
+        yield "".join(outputs).replace("<|EOT|>", "")
 chat_interface = gr.ChatInterface(
             step=1,
             value=DEFAULT_MAX_NEW_TOKENS,
         ),
         gr.Slider(
             label="Top-p (nucleus sampling)",
             minimum=0.05,