Starchik committed on
Commit 56b32b5 · verified · 1 Parent(s): db66a89

Update app.py

Files changed (1)
app.py +19 −26
app.py CHANGED
@@ -18,22 +18,22 @@ This space demonstrates model [DeepSeek-Coder](https://huggingface.co/deepseek-a
 **You can also try our 33B model in [official homepage](https://coder.deepseek.com/chat).**
 """
 
-# Check whether a GPU is available
-use_cuda = torch.cuda.is_available()
-
-if not use_cuda:
-    DESCRIPTION += "\n<p>Running on CPU 🥶 Performance may be significantly slower.</p>"
-
-# Select the device
-device = torch.device("cuda" if use_cuda else "cpu")
-torch_dtype = torch.bfloat16 if use_cuda else torch.float32
-
-# Load the model and tokenizer
-model_id = "deepseek-ai/deepseek-coder-33b-instruct"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch_dtype, device_map="auto" if use_cuda else None)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-tokenizer.use_default_system_prompt = False
-
+# Check if CUDA is available
+if not torch.cuda.is_available():
+    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo might be slow on CPU.</p>"
+    device = torch.device("cpu")
+else:
+    device = torch.device("cuda")
+    model_id = "deepseek-ai/deepseek-coder-33b-instruct"
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    tokenizer.use_default_system_prompt = False
+
+# Fallback to CPU for model loading if CUDA is unavailable
+if not torch.cuda.is_available():
+    model_id = "deepseek-ai/deepseek-coder-33b-instruct"
+    model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu")
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 @spaces.GPU
 def generate(
@@ -48,11 +48,8 @@ def generate(
 ) -> Iterator[str]:
     global total_count
     total_count += 1
-    print(f"Request number: {total_count}")
-
-    if use_cuda:
-        os.system("nvidia-smi")
-
+    print(total_count)
+    os.system("nvidia-smi")
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
@@ -61,16 +58,14 @@ def generate(
     conversation.append({"role": "user", "content": message})
 
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
-
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-
     input_ids = input_ids.to(device)
 
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        input_ids=input_ids,
+        {"input_ids": input_ids},
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=False,
@@ -80,7 +75,6 @@
         repetition_penalty=repetition_penalty,
         eos_token_id=32021
     )
-
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
 
@@ -89,7 +83,6 @@
        outputs.append(text)
        yield "".join(outputs).replace("<|EOT|>", "")
 
-
 chat_interface = gr.ChatInterface(
     fn=generate,
     additional_inputs=[
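
Note on the new startup block: it evaluates torch.cuda.is_available() twice, once to pick the device and once as the CPU fallback for loading. For comparison only, a single-pass version of the same idea is sketched below; this is illustrative, not part of the commit, and it assumes float32 is acceptable on CPU (mirroring the dtype split used in the previous revision).

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Sketch only: one device/dtype decision, one model load. Not the committed code.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
dtype = torch.bfloat16 if use_cuda else torch.float32

model_id = "deepseek-ai/deepseek-coder-33b-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=dtype,
    device_map="auto" if use_cuda else None,  # accelerate placement on GPU, plain CPU load otherwise
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.use_default_system_prompt = False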
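On the generate_kwargs change: dict({"input_ids": input_ids}, streamer=..., ...) copies the positional mapping and then applies the keyword arguments, so it builds the same dictionary as the previous input_ids=input_ids spelling. A tiny check with illustrative values:

# Both spellings produce the same kwargs dict passed to model.generate.
a = dict(input_ids=[1, 2, 3], max_new_tokens=8, do_sample=False)
b = dict({"input_ids": [1, 2, 3]}, max_new_tokens=8, do_sample=False)
assert a == b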
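The unchanged body of generate() follows the usual Transformers streaming pattern: model.generate runs on a background Thread while a TextIteratorStreamer yields decoded chunks to the caller. A minimal self-contained sketch of that pattern is below; the helper name stream_reply and its defaults are illustrative, not taken from app.py.

from threading import Thread
from typing import Iterator

from transformers import PreTrainedModel, PreTrainedTokenizer, TextIteratorStreamer


def stream_reply(model: PreTrainedModel,
                 tokenizer: PreTrainedTokenizer,
                 conversation: list[dict],
                 max_new_tokens: int = 256) -> Iterator[str]:
    # Build the prompt tensor from the chat history and move it to the model's device.
    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
    input_ids = input_ids.to(model.device)

    # The streamer receives tokens as generate() produces them on the worker thread.
    streamer = TextIteratorStreamer(tokenizer, timeout=10.0,
                                    skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate,
           kwargs=dict(input_ids=input_ids,
                       streamer=streamer,
                       max_new_tokens=max_new_tokens,
                       do_sample=False)).start()

    # Iterating over the streamer yields growing partial replies as text arrives.
    pieces = []
    for chunk in streamer:
        pieces.append(chunk)
        yield "".join(pieces)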