runninglsy committed
Commit 54299ef · 1 Parent(s): 648c219

Update examples and track new files with Git LFS
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
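The new rule stores every *.jpg file through Git LFS, mirroring the existing *.png rule; it is the line that running git lfs track "*.jpg" would append, and it covers the .jpg example images added in this commit.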
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🦫
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version: 5.15.0
+sdk_version: 5.1.0
 app_file: app.py
 pinned: false
 license: apache-2.0
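In a Hugging Face Space README, the sdk_version front-matter field pins the Gradio release the Space runs on; this commit moves the pin from 5.15.0 to the earlier 5.1.0 release.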
app.py CHANGED
@@ -4,14 +4,16 @@ subprocess.run('pip install flash-attn==2.7.0.post2 --no-build-isolation', env={
 import spaces
 import os
 import re
-import time
-import gradio as gr
-import torch
-from transformers import AutoModelForCausalLM
-from transformers import TextIteratorStreamer
+import logging
+from typing import List, Any
 from threading import Thread
 
+import torch
+import gradio as gr
+from transformers import AutoModelForCausalLM, TextIteratorStreamer
+
 model_name = 'AIDC-AI/Ovis2-16B'
+use_thread = False
 
 # load model
 model = AutoModelForCausalLM.from_pretrained(model_name,
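This hunk drops the unused time import, regroups the heavy third-party imports (torch, gradio, transformers) after the stdlib ones, and introduces a module-level use_thread flag that selects between synchronous and threaded streaming generation; a sketch of the threaded pattern follows the next hunk.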
@@ -29,82 +31,106 @@ def submit_chat(chatbot, text_input):
     chatbot.append((text_input, response))
     return chatbot ,''
 
-@spaces.GPU
-def ovis_chat(chatbot, image_input):
-    # preprocess inputs
-    conversations = [{
-        "from": "system",
-        "value": "You are a helpful assistant, and your task is to provide reliable and structured responses to users."
-    }]
-    response = ""
-    text_input = chatbot[-1][0]
-    for query, response in chatbot[:-1]:
-        conversations.append({
-            "from": "human",
-            "value": query
-        })
-        conversations.append({
-            "from": "gpt",
-            "value": response
-        })
-    text_input = text_input.replace(image_placeholder, '')
-    conversations.append({
-        "from": "human",
-        "value": text_input
-    })
-    if image_input is not None:
-        conversations[1]["value"] = image_placeholder + '\n' + conversations[1]["value"]
-    prompt, input_ids, pixel_values = model.preprocess_inputs(conversations, [image_input])
-    attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
-    input_ids = input_ids.unsqueeze(0).to(device=model.device)
-    attention_mask = attention_mask.unsqueeze(0).to(device=model.device)
-    if image_input is None:
-        pixel_values = [None]
-    else:
-        pixel_values = [pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)]
+# @spaces.GPU
+use_thread = False
+
+# load model
+model = AutoModelForCausalLM.from_pretrained(model_name,
+                                             torch_dtype=torch.bfloat16,
+                                             multimodal_max_length=8192,
+                                             trust_remote_code=True).to(device='cuda')
+text_tokenizer = model.get_text_tokenizer()
+visual_tokenizer = model.get_visual_tokenizer()
+streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
+image_placeholder = '<image>'
+cur_dir = os.path.dirname(os.path.abspath(__file__))
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def initialize_gen_kwargs():
+    return {
+        "max_new_tokens": 1536,
+        "do_sample": False,
+        "top_p": None,
+        "top_k": None,
+        "temperature": None,
+        "repetition_penalty": 1.05,
+        "eos_token_id": model.generation_config.eos_token_id,
+        "pad_token_id": text_tokenizer.pad_token_id,
+        "use_cache": True
+    }
+
+def submit_chat(chatbot, text_input):
+    response = ''
+    chatbot.append((text_input, response))
+    return chatbot ,''
+
+# @spaces.GPU
+def ovis_chat(chatbot: List[List[str]], image_input: Any):
+    conversations, model_inputs = prepare_inputs(chatbot, image_input)
+    gen_kwargs = initialize_gen_kwargs()
 
     with torch.inference_mode():
-        gen_kwargs = dict(
-            max_new_tokens=1536,
-            do_sample=False,
-            top_p=None,
-            top_k=None,
-            temperature=None,
-            repetition_penalty=None,
-            eos_token_id=model.generation_config.eos_token_id,
-            pad_token_id=text_tokenizer.pad_token_id,
-            use_cache=True
-        )
+        generate_func = lambda: model.generate(**model_inputs, **gen_kwargs, streamer=streamer)
+
+        if use_thread:
+            thread = Thread(target=generate_func)
+            thread.start()
+        else:
+            generate_func()
+
         response = ""
-        # thread = Thread(target=model.generate,
-        #                 kwargs={"inputs": input_ids,
-        #                         "pixel_values": pixel_values,
-        #                         "attention_mask": attention_mask,
-        #                         "streamer": streamer,
-        #                         **gen_kwargs})
-        model.generate(
-            input_ids,
-            pixel_values=pixel_values,
-            attention_mask=attention_mask,
-            streamer=streamer,
-            **gen_kwargs
-        )
-        # thread.start()
         for new_text in streamer:
             response += new_text
             chatbot[-1][1] = response
             yield chatbot
-        # thread.join()
-    # debug
-    print('*'*60)
-    print('*'*60)
-    print('OVIS_CONV_START')
-    for i, (request, answer) in enumerate(chatbot[:-1], 1):
-        print(f'Q{i}:\n {request}')
-        print(f'A{i}:\n {answer}')
-    print('New_Q:\n', text_input)
-    print('New_A:\n', response)
-    print('OVIS_CONV_END')
+
+    if use_thread:
+        thread.join()
+
+    log_conversation(chatbot)
+
+
+def prepare_inputs(chatbot: List[List[str]], image_input: Any):
+    # conversations = [{
+    #     "from": "system",
+    #     "value": "You are a helpful assistant, and your task is to provide reliable and structured responses to users."
+    # }]
+    conversations= []
+
+    for query, response in chatbot[:-1]:
+        conversations.extend([
+            {"from": "human", "value": query},
+            {"from": "gpt", "value": response}
+        ])
+
+    last_query = chatbot[-1][0].replace(image_placeholder, '')
+    conversations.append({"from": "human", "value": last_query})
+
+    if image_input is not None:
+        for conv in conversations:
+            if conv["from"] == "human":
+                conv["value"] = f'{image_placeholder}\n{conv["value"]}'
+                break
+
+    logger.info(conversations)
+
+    prompt, input_ids, pixel_values = model.preprocess_inputs(conversations, [image_input], max_partition=16)
+    attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
+
+    model_inputs = {
+        "inputs": input_ids.unsqueeze(0).to(device=model.device),
+        "attention_mask": attention_mask.unsqueeze(0).to(device=model.device),
+        "pixel_values": [pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)] if image_input is not None else [None]
+    }
+
+    return conversations, model_inputs
+
+def log_conversation(chatbot):
+    logger.info("[OVIS_CONV_START]")
+    [print(f'Q{i}:\n {request}\nA{i}:\n {answer}') for i, (request, answer) in enumerate(chatbot, 1)]
+    logger.info("[OVIS_CONV_END]")
 
 def clear_chat():
     return [], None, ""
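The refactor splits ovis_chat into prepare_inputs, initialize_gen_kwargs, and log_conversation, and gates the two generation paths behind use_thread. Behavioral changes visible in the diff: repetition_penalty moves from None to 1.05, preprocess_inputs now passes max_partition=16, the system prompt is commented out, and the image placeholder is prepended to the first human turn by search rather than by indexing conversations[1] past the old system message. Note that the added block also repeats the use_thread flag, the model load, and submit_chat that the new file already has above this point, so as committed the model is loaded twice and the later definitions take effect. With use_thread = False, model.generate runs to completion before the for new_text in streamer loop starts draining the streamer's queue, so chatbot updates are yielded only after generation finishes; the threaded path is what makes the streaming incremental.

For reference, a minimal, self-contained sketch of the threaded TextIteratorStreamer pattern, using the small text-only gpt2 checkpoint as a stand-in for the Ovis model (the prompt and max_new_tokens here are illustrative assumptions):

    from threading import Thread

    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    # gpt2 is a stand-in so the sketch runs without the Ovis weights
    tokenizer = AutoTokenizer.from_pretrained('gpt2')
    model = AutoModelForCausalLM.from_pretrained('gpt2')

    inputs = tokenizer('Streaming generation emits text', return_tensors='pt')
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks until decoding ends, so it runs on a worker thread
    # while the main thread drains the streamer's queue chunk by chunk
    thread = Thread(target=model.generate,
                    kwargs=dict(**inputs, streamer=streamer, max_new_tokens=32))
    thread.start()

    response = ''
    for new_text in streamer:  # yields decoded text pieces as they arrive
        response += new_text
        print(new_text, end='', flush=True)
    thread.join()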
@@ -124,7 +150,7 @@ html = f"""
 latex_delimiters_set = [{
     "left": "\\(",
     "right": "\\)",
-    "display": True
+    "display": False
 }, {
     "left": "\\begin{equation}",
     "right": "\\end{equation}",
@@ -159,9 +185,11 @@ with gr.Blocks(title=model_name.split('/')[-1], theme=gr.themes.Ocean()) as demo
         image_input = gr.Image(label="image", height=350, type="pil")
         gr.Examples(
             examples=[
-                [f"{cur_dir}/examples/case0.png", "Find the area of the shaded region."],
-                [f"{cur_dir}/examples/case1.png", "explain this model to me."],
-                [f"{cur_dir}/examples/case2.png", "What is net profit margin as a percentage of total revenue?"],
+                [f"{cur_dir}/examples/ovis2_math0.jpg", "Each face of the polyhedron shown is either a triangle or a square. Each square borders 4 triangles, and each triangle borders 3 squares. The polyhedron has 6 squares. How many triangles does it have?\n\nProvide a step-by-step solution to the problem, and conclude with 'the answer is' followed by the final solution."],
+                [f"{cur_dir}/examples/ovis2_math1.jpg", "A large square touches another two squares, as shown in the picture. The numbers inside the smaller squares indicate their areas. What is the area of the largest square?\n\nProvide a step-by-step solution to the problem, and conclude with 'the answer is' followed by the final solution."],
+                [f"{cur_dir}/examples/ovis2_figure0.png", "Explain this model."],
+                [f"{cur_dir}/examples/ovis2_figure1.png", "Extract the notes about PPO and GRPO in the figure, paying attention to readability."],
+                [f"{cur_dir}/examples/ovis2_multi0.jpg", "Posso avere un frappuccino e un caffè americano di taglia M? Quanto costa in totale?"],
             ],
             inputs=[image_input, text_input]
         )
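The three generic case*.png examples give way to five prompts matched to the renamed and newly added assets below: two geometry problems that request step-by-step solutions, two figure-reading prompts (one targeting notes on PPO and GRPO), and one multilingual prompt; the Italian text translates to "Can I have a frappuccino and a medium americano? How much is it in total?".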
examples/{case1.png → ovis2_figure0.png} RENAMED
File without changes
examples/ovis2_figure1.png ADDED

Git LFS Details

  • SHA256: 424846820189aad49941d3efba10f3e66925a9c80204bab297bc3d120a0fed4d
  • Pointer size: 132 Bytes
  • Size of remote file: 1.21 MB
examples/{case0.png → ovis2_math0.jpg} RENAMED
File without changes
examples/{case2.png → ovis2_math1.jpg} RENAMED
File without changes
examples/ovis2_multi0.jpg ADDED

Git LFS Details

  • SHA256: 66f1f86d24b0f334f039165ebd1ec3e83cefcf7b8bea87e9ec2d42a09c1f84e5
  • Pointer size: 132 Bytes
  • Size of remote file: 3.41 MB
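The Git LFS details above describe the pointer stub committed in place of each binary: a roughly 130-byte text file recording the blob's SHA256 and size, with the actual image fetched from LFS storage at checkout.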