multimodalart (HF Staff) committed
Commit 7f1c6a8 (verified) · 1 Parent(s): a001585

All in global context
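In short: the processor, the model, and every helper move out of the GLM4VModel class (and its lazy load_model() call) into plain module-level globals and functions. A minimal sketch of the pattern this commit applies, not the full file:

    # before: state lived on a class and was loaded lazily
    #   load_model()
    #   glm4v = GLM4VModel()
    #   for chunk in glm4v.stream_generate(hist, prompt): ...
    # after: created once at import time, called directly
    processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
    model = Glm4vForConditionalGeneration.from_pretrained(MODEL_PATH, ...)
    for chunk in stream_generate(hist, prompt):
        ...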

Files changed (1):
  1. app.py +138 -144
app.py CHANGED
@@ -15,154 +15,153 @@ import time
 MODEL_PATH = "THUDM/GLM-4.1V-9B-Thinking"
 stop_generation = False
 
-processor = None
-model = None
-
-def load_model():
-    """Load the model and processor."""
-    global processor, model
-    processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
-    model = Glm4vForConditionalGeneration.from_pretrained(
-        MODEL_PATH,
-        torch_dtype=torch.bfloat16,
-        device_map="auto",
-        attn_implementation="sdpa",
-    )
-
-class GLM4VModel:
-    def __init__(self):
-        pass
-
-    def _strip_html(self, t):
-        return re.sub(r"<[^>]+>", "", t).strip()
-
-    def _wrap_text(self, t):
-        return [{"type": "text", "text": t}]
-
-    def _pdf_to_imgs(self, pdf_path):
-        doc = fitz.open(pdf_path)
-        imgs = []
-        for i in range(doc.page_count):
-            pix = doc.load_page(i).get_pixmap(dpi=180)
-            img_p = os.path.join(tempfile.gettempdir(), f"{Path(pdf_path).stem}_{i}.png")
-            pix.save(img_p)
-            imgs.append(img_p)
-        doc.close()
-        return imgs
-
-    def _ppt_to_imgs(self, ppt_path):
-        tmp = tempfile.mkdtemp()
-        subprocess.run(
-            ["libreoffice", "--headless", "--convert-to", "pdf", "--outdir", tmp, ppt_path],
-            check=True,
-        )
-        pdf_path = os.path.join(tmp, Path(ppt_path).stem + ".pdf")
-        return self._pdf_to_imgs(pdf_path)
-
-    def _files_to_content(self, media):
-        out = []
-        for f in media or []:
-            ext = Path(f.name).suffix.lower()
-            if ext in [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".m4v"]:
-                out.append({"type": "video", "url": f.name})
-            elif ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
-                out.append({"type": "image", "url": f.name})
-            elif ext in [".ppt", ".pptx"]:
-                for p in self._ppt_to_imgs(f.name):
-                    out.append({"type": "image", "url": p})
-            elif ext == ".pdf":
-                for p in self._pdf_to_imgs(f.name):
-                    out.append({"type": "image", "url": p})
-        return out
-
-    def _stream_fragment(self, buf: str) -> str:
-        think_html = ""
-        if "<think>" in buf:
-            if "</think>" in buf:
-                seg = re.search(r"<think>(.*?)</think>", buf, re.DOTALL)
-                if seg:
-                    think_html = (
-                        "<details open><summary style='cursor:pointer;font-weight:bold;color:#bbbbbb;'>💭 Thinking</summary>"
-                        "<div style='color:#cccccc;line-height:1.4;padding:10px;border-left:3px solid #666;margin:5px 0;background-color:rgba(128,128,128,0.1);'>"
-                        + seg.group(1).strip().replace("\n", "<br>")
-                        + "</div></details>"
-                    )
-            else:
-                part = buf.split("<think>", 1)[1]
-                think_html = (
-                    "<details open><summary style='cursor:pointer;font-weight:bold;color:#bbbbbb;'>💭 Thinking</summary>"
-                    "<div style='color:#cccccc;line-height:1.4;padding:10px;border-left:3px solid #666;margin:5px 0;background-color:rgba(128,128,128,0.1);'>"
-                    + part.replace("\n", "<br>")
-                    + "</div></details>"
-                )
-
-        answer_html = ""
-        if "<answer>" in buf:
-            if "</answer>" in buf:
-                seg = re.search(r"<answer>(.*?)</answer>", buf, re.DOTALL)
-                if seg:
-                    answer_html = seg.group(1).strip()
-            else:
-                answer_html = buf.split("<answer>", 1)[1]
-
-        if not think_html and not answer_html:
-            return self._strip_html(buf)
-        return think_html + answer_html
-
-    def _build_messages(self, raw_hist, sys_prompt):
-        msgs = []
-
-        if sys_prompt.strip():
-            msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
-
-        for h in raw_hist:
-            if h["role"] == "user":
-                msgs.append({"role": "user", "content": h["content"]})
-            else:
-                raw = h["content"]
-                raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL)
-                raw = re.sub(r"<details.*?</details>", "", raw, flags=re.DOTALL)
-                clean = self._strip_html(raw).strip()
-                msgs.append({"role": "assistant", "content": self._wrap_text(clean)})
-        return msgs
-
-    @spaces.GPU(duration=240)
-    def stream_generate(self, raw_hist, sys_prompt):
-        global stop_generation, processor, model
-        stop_generation = False
-        msgs = self._build_messages(raw_hist, sys_prompt)
-        inputs = processor.apply_chat_template(
-            msgs,
-            tokenize=True,
-            add_generation_prompt=True,
-            return_dict=True,
-            return_tensors="pt",
-            padding=True,
-        ).to(model.device)
-
-        streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=False)
-        gen_args = dict(
-            inputs,
-            max_new_tokens=8192,
-            repetition_penalty=1.1,
-            do_sample=True,
-            top_k=2,
-            temperature=None,
-            top_p=1e-5,
-            streamer=streamer,
-        )
-
-        generation_thread = threading.Thread(target=model.generate, kwargs=gen_args)
-        generation_thread.start()
-
-        buf = ""
-        for tok in streamer:
-            if stop_generation:
-                break
-            buf += tok
-            yield self._stream_fragment(buf)
-
-        generation_thread.join()
+# Global model and processor
+processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
+model = Glm4vForConditionalGeneration.from_pretrained(
+    MODEL_PATH,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    attn_implementation="sdpa",
+)
+
+
+def _strip_html(t):
+    return re.sub(r"<[^>]+>", "", t).strip()
+
+
+def _wrap_text(t):
+    return [{"type": "text", "text": t}]
+
+
+def _pdf_to_imgs(pdf_path):
+    doc = fitz.open(pdf_path)
+    imgs = []
+    for i in range(doc.page_count):
+        pix = doc.load_page(i).get_pixmap(dpi=180)
+        img_p = os.path.join(tempfile.gettempdir(), f"{Path(pdf_path).stem}_{i}.png")
+        pix.save(img_p)
+        imgs.append(img_p)
+    doc.close()
+    return imgs
+
+
+def _ppt_to_imgs(ppt_path):
+    tmp = tempfile.mkdtemp()
+    subprocess.run(
+        ["libreoffice", "--headless", "--convert-to", "pdf", "--outdir", tmp, ppt_path],
+        check=True,
+    )
+    pdf_path = os.path.join(tmp, Path(ppt_path).stem + ".pdf")
+    return _pdf_to_imgs(pdf_path)
+
+
+def _files_to_content(media):
+    out = []
+    for f in media or []:
+        ext = Path(f.name).suffix.lower()
+        if ext in [".mp4", ".avi", ".mkv", ".mov", ".wmv", ".flv", ".webm", ".mpeg", ".m4v"]:
+            out.append({"type": "video", "url": f.name})
+        elif ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".webp"]:
+            out.append({"type": "image", "url": f.name})
+        elif ext in [".ppt", ".pptx"]:
+            for p in _ppt_to_imgs(f.name):
+                out.append({"type": "image", "url": p})
+        elif ext == ".pdf":
+            for p in _pdf_to_imgs(f.name):
+                out.append({"type": "image", "url": p})
+    return out
+
+
+def _stream_fragment(buf: str) -> str:
+    think_html = ""
+    if "<think>" in buf:
+        if "</think>" in buf:
+            seg = re.search(r"<think>(.*?)</think>", buf, re.DOTALL)
+            if seg:
+                think_html = (
+                    "<details open><summary style='cursor:pointer;font-weight:bold;color:#bbbbbb;'>💭 Thinking</summary>"
+                    "<div style='color:#cccccc;line-height:1.4;padding:10px;border-left:3px solid #666;margin:5px 0;background-color:rgba(128,128,128,0.1);'>"
+                    + seg.group(1).strip().replace("\n", "<br>")
+                    + "</div></details>"
+                )
+        else:
+            part = buf.split("<think>", 1)[1]
+            think_html = (
+                "<details open><summary style='cursor:pointer;font-weight:bold;color:#bbbbbb;'>💭 Thinking</summary>"
+                "<div style='color:#cccccc;line-height:1.4;padding:10px;border-left:3px solid #666;margin:5px 0;background-color:rgba(128,128,128,0.1);'>"
+                + part.replace("\n", "<br>")
+                + "</div></details>"
+            )
+
+    answer_html = ""
+    if "<answer>" in buf:
+        if "</answer>" in buf:
+            seg = re.search(r"<answer>(.*?)</answer>", buf, re.DOTALL)
+            if seg:
+                answer_html = seg.group(1).strip()
+        else:
+            answer_html = buf.split("<answer>", 1)[1]
+
+    if not think_html and not answer_html:
+        return _strip_html(buf)
+    return think_html + answer_html
+
+
+def _build_messages(raw_hist, sys_prompt):
+    msgs = []
+
+    if sys_prompt.strip():
+        msgs.append({"role": "system", "content": [{"type": "text", "text": sys_prompt.strip()}]})
+
+    for h in raw_hist:
+        if h["role"] == "user":
+            msgs.append({"role": "user", "content": h["content"]})
+        else:
+            raw = h["content"]
+            raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL)
+            raw = re.sub(r"<details.*?</details>", "", raw, flags=re.DOTALL)
+            clean = _strip_html(raw).strip()
+            msgs.append({"role": "assistant", "content": _wrap_text(clean)})
+    return msgs
+
+
+@spaces.GPU(duration=240)
+def stream_generate(raw_hist, sys_prompt):
+    global stop_generation
+    stop_generation = False
+    msgs = _build_messages(raw_hist, sys_prompt)
+    inputs = processor.apply_chat_template(
+        msgs,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt",
+        padding=True,
+    ).to(model.device)
+
+    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=False)
+    gen_args = dict(
+        inputs,
+        max_new_tokens=8192,
+        repetition_penalty=1.1,
+        do_sample=True,
+        top_k=2,
+        temperature=None,
+        top_p=1e-5,
+        streamer=streamer,
+    )
+
+    generation_thread = threading.Thread(target=model.generate, kwargs=gen_args)
+    generation_thread.start()
+
+    buf = ""
+    for tok in streamer:
+        if stop_generation:
+            break
+        buf += tok
+        yield _stream_fragment(buf)
+
+    generation_thread.join()
 
 
 def format_display_content(content):
@@ -193,11 +192,6 @@ def create_display_history(raw_hist):
     return display_hist
 
 
-# Load the model and processor
-load_model()
-glm4v = GLM4VModel()
-
-
 def check_files(files):
     vids = imgs = ppts = pdfs = 0
     for f in files or []:
@@ -230,10 +224,10 @@ def chat(files, msg, raw_hist, sys_prompt):
         yield display_hist, copy.deepcopy(raw_hist), None, ""
         return
 
-    payload = glm4v._files_to_content(files) if files else None
+    payload = _files_to_content(files) if files else None
    if msg.strip():
        if payload is None:
-            payload = glm4v._wrap_text(msg.strip())
+            payload = _wrap_text(msg.strip())
        else:
            payload.append({"type": "text", "text": msg.strip()})
 
@@ -248,7 +242,7 @@ def chat(files, msg, raw_hist, sys_prompt):
     display_hist = create_display_history(raw_hist)
     yield display_hist, copy.deepcopy(raw_hist), None, ""
 
-    for chunk in glm4v.stream_generate(raw_hist[:-1], sys_prompt):
+    for chunk in stream_generate(raw_hist[:-1], sys_prompt):
         if stop_generation:
             break
         place["content"] = chunk
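A side note on the generation settings, which this commit keeps unchanged: with do_sample=True, top_k=2, and top_p=1e-5, the nucleus filter in transformers retains only the single most probable token (it always keeps at least one), so decoding is effectively greedy. Under that reading, a hypothetical equivalent and more explicit configuration would be:

    gen_args = dict(
        inputs,
        max_new_tokens=8192,
        repetition_penalty=1.1,
        do_sample=False,  # assumption: greedy decoding matches top_p=1e-5 sampling here
        streamer=streamer,
    )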
 
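For quick verification of the new module-level helpers, a hypothetical smoke test (the buffers are made up; the functions are as defined in app.py above):

    assert _wrap_text("hi") == [{"type": "text", "text": "hi"}]
    assert _strip_html("<b>hi</b>") == "hi"
    # no <think>/<answer> tags: the buffer comes back HTML-stripped
    assert _stream_fragment("plain token") == "plain token"
    # an unclosed <think> block streams inside an open <details> element
    assert _stream_fragment("<think>step 1").startswith("<details open>")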