Starchik committed on
Commit db66a89 · verified · 1 Parent(s): 69ab3c6

Update app.py

Files changed (1)
  1. app.py +25 -25
app.py CHANGED
@@ -1,5 +1,4 @@
 import os
-
 from threading import Thread
 from typing import Iterator
 
@@ -10,27 +9,30 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
-total_count=0
+total_count = 0
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 DESCRIPTION = """\
 # DeepSeek-33B-Chat
-
 This space demonstrates model [DeepSeek-Coder](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct) by DeepSeek, a code model with 33B parameters fine-tuned for chat instructions.
-
 **You can also try our 33B model in [official homepage](https://coder.deepseek.com/chat).**
 """
 
-if not torch.cuda.is_available():
-    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
+# Check GPU availability
+use_cuda = torch.cuda.is_available()
 
+if not use_cuda:
+    DESCRIPTION += "\n<p>Running on CPU 🥶 Performance may be significantly slower.</p>"
 
-if torch.cuda.is_available():
-    model_id = "deepseek-ai/deepseek-coder-33b-instruct"
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.use_default_system_prompt = False
-
+# Select device
+device = torch.device("cuda" if use_cuda else "cpu")
+torch_dtype = torch.bfloat16 if use_cuda else torch.float32
+
+# Load model and tokenizer
+model_id = "deepseek-ai/deepseek-coder-33b-instruct"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch_dtype, device_map="auto" if use_cuda else None)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+tokenizer.use_default_system_prompt = False
 
 
 @spaces.GPU
@@ -46,8 +48,11 @@ def generate(
 ) -> Iterator[str]:
     global total_count
     total_count += 1
-    print(total_count)
-    os.system("nvidia-smi")
+    print(f"Request number: {total_count}")
+
+    if use_cuda:
+        os.system("nvidia-smi")
+
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
@@ -56,31 +61,33 @@ def generate(
     conversation.append({"role": "user", "content": message})
 
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
+
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = input_ids.to(model.device)
+
+    input_ids = input_ids.to(device)
 
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
-        {"input_ids": input_ids},
+        input_ids=input_ids,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=False,
         top_p=top_p,
         top_k=top_k,
         num_beams=1,
-        # temperature=temperature,
         repetition_penalty=repetition_penalty,
         eos_token_id=32021
     )
+
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
     outputs = []
     for text in streamer:
         outputs.append(text)
-        yield "".join(outputs).replace("<|EOT|>","")
+        yield "".join(outputs).replace("<|EOT|>", "")
 
 
 chat_interface = gr.ChatInterface(
@@ -94,13 +101,6 @@ chat_interface = gr.ChatInterface(
         step=1,
         value=DEFAULT_MAX_NEW_TOKENS,
     ),
-    # gr.Slider(
-    #     label="Temperature",
-    #     minimum=0,
-    #     maximum=4.0,
-    #     step=0.1,
-    #     value=0,
-    # ),
     gr.Slider(
         label="Top-p (nucleus sampling)",
         minimum=0.05,
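
The updated generate() keeps the same streaming setup the diff leaves untouched: model.generate runs in a background thread while a TextIteratorStreamer yields partial text back to the caller. Below is a minimal, self-contained sketch of that pattern; the tiny checkpoint name is a hypothetical stand-in used only to keep the example light, not the deepseek-ai/deepseek-coder-33b-instruct model the Space actually loads.

from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Same CPU/GPU fallback the commit introduces.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model_id = "sshleifer/tiny-gpt2"  # hypothetical stand-in checkpoint, not the Space's model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

inputs = tokenizer("def fibonacci(n):", return_tensors="pt").to(device)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs in a thread; the streamer is drained on the main thread.
thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=32))
thread.start()

pieces = []
for chunk in streamer:
    pieces.append(chunk)
    print("".join(pieces))  # each iteration sees the text so far, like the Gradio yield loop
thread.join()

With this shape in mind, the commit's functional changes reduce to the explicit device/dtype selection at load time and passing input_ids as a keyword argument into generate_kwargs.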