Commit 54299ef
Parent(s): 648c219

Update examples and track new files with Git LFS

Changed files:
- .gitattributes +1 -0
- README.md +1 -1
- app.py +106 -78
- examples/{case1.png → ovis2_figure0.png} +0 -0
- examples/ovis2_figure1.png +3 -0
- examples/{case0.png → ovis2_math0.jpg} +2 -2
- examples/{case2.png → ovis2_math1.jpg} +2 -2
- examples/ovis2_multi0.jpg +3 -0
.gitattributes
CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🦫
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version: 5.…
+sdk_version: 5.1.0
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py
CHANGED
@@ -4,14 +4,16 @@ subprocess.run('pip install flash-attn==2.7.0.post2 --no-build-isolation', env={
 import spaces
 import os
 import re
-import …
-import …
-import torch
-from transformers import AutoModelForCausalLM
-from transformers import TextIteratorStreamer
+import logging
+from typing import List, Any
 from threading import Thread
 
+import torch
+import gradio as gr
+from transformers import AutoModelForCausalLM, TextIteratorStreamer
+
 model_name = 'AIDC-AI/Ovis2-16B'
+use_thread = False
 
 # load model
 model = AutoModelForCausalLM.from_pretrained(model_name,

@@ -29,82 +31,106 @@ def submit_chat(chatbot, text_input):
     chatbot.append((text_input, response))
     return chatbot ,''
 
-@spaces.GPU
-… [removed lines following @spaces.GPU are not recoverable from this rendering]
+# @spaces.GPU
+use_thread = False
+
+# load model
+model = AutoModelForCausalLM.from_pretrained(model_name,
+                                             torch_dtype=torch.bfloat16,
+                                             multimodal_max_length=8192,
+                                             trust_remote_code=True).to(device='cuda')
+text_tokenizer = model.get_text_tokenizer()
+visual_tokenizer = model.get_visual_tokenizer()
+streamer = TextIteratorStreamer(text_tokenizer, skip_prompt=True, skip_special_tokens=True)
+image_placeholder = '<image>'
+cur_dir = os.path.dirname(os.path.abspath(__file__))
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def initialize_gen_kwargs():
+    return {
+        "max_new_tokens": 1536,
+        "do_sample": False,
+        "top_p": None,
+        "top_k": None,
+        "temperature": None,
+        "repetition_penalty": 1.05,
+        "eos_token_id": model.generation_config.eos_token_id,
+        "pad_token_id": text_tokenizer.pad_token_id,
+        "use_cache": True
+    }
+
+def submit_chat(chatbot, text_input):
+    response = ''
+    chatbot.append((text_input, response))
+    return chatbot ,''
+
+# @spaces.GPU
+def ovis_chat(chatbot: List[List[str]], image_input: Any):
+    conversations, model_inputs = prepare_inputs(chatbot, image_input)
+    gen_kwargs = initialize_gen_kwargs()
 
     with torch.inference_mode():
-… [removed lines not recoverable from this rendering]
-            pad_token_id=text_tokenizer.pad_token_id,
-            use_cache=True
-        )
+        generate_func = lambda: model.generate(**model_inputs, **gen_kwargs, streamer=streamer)
+
+        if use_thread:
+            thread = Thread(target=generate_func)
+            thread.start()
+        else:
+            generate_func()
+
     response = ""
-        # thread = Thread(target=model.generate,
-        #                 kwargs={"inputs": input_ids,
-        #                         "pixel_values": pixel_values,
-        #                         "attention_mask": attention_mask,
-        #                         "streamer": streamer,
-        #                         **gen_kwargs})
-        model.generate(
-            input_ids,
-            pixel_values=pixel_values,
-            attention_mask=attention_mask,
-            streamer=streamer,
-            **gen_kwargs
-        )
-        # thread.start()
     for new_text in streamer:
         response += new_text
         chatbot[-1][1] = response
         yield chatbot
-… [removed lines not recoverable from this rendering]
+
+    if use_thread:
+        thread.join()
+
+    log_conversation(chatbot)
+
+
+def prepare_inputs(chatbot: List[List[str]], image_input: Any):
+    # conversations = [{
+    #     "from": "system",
+    #     "value": "You are a helpful assistant, and your task is to provide reliable and structured responses to users."
+    # }]
+    conversations = []
+
+    for query, response in chatbot[:-1]:
+        conversations.extend([
+            {"from": "human", "value": query},
+            {"from": "gpt", "value": response}
+        ])
+
+    last_query = chatbot[-1][0].replace(image_placeholder, '')
+    conversations.append({"from": "human", "value": last_query})
+
+    if image_input is not None:
+        for conv in conversations:
+            if conv["from"] == "human":
+                conv["value"] = f'{image_placeholder}\n{conv["value"]}'
+                break
+
+    logger.info(conversations)
+
+    prompt, input_ids, pixel_values = model.preprocess_inputs(conversations, [image_input], max_partition=16)
+    attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
+
+    model_inputs = {
+        "inputs": input_ids.unsqueeze(0).to(device=model.device),
+        "attention_mask": attention_mask.unsqueeze(0).to(device=model.device),
+        "pixel_values": [pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)] if image_input is not None else [None]
+    }
+
+    return conversations, model_inputs
+
+def log_conversation(chatbot):
+    logger.info("[OVIS_CONV_START]")
+    [print(f'Q{i}:\n {request}\nA{i}:\n {answer}') for i, (request, answer) in enumerate(chatbot, 1)]
+    logger.info("[OVIS_CONV_END]")
 
 def clear_chat():
     return [], None, ""

@@ -124,7 +150,7 @@ html = f"""
 latex_delimiters_set = [{
     "left": "\\(",
     "right": "\\)",
-    "display": …
+    "display": False
 }, {
     "left": "\\begin{equation}",
     "right": "\\end{equation}",

@@ -159,9 +185,11 @@ with gr.Blocks(title=model_name.split('/')[-1], theme=gr.themes.Ocean()) as demo
         image_input = gr.Image(label="image", height=350, type="pil")
         gr.Examples(
             examples=[
-                [f"{cur_dir}/examples/…
-                [f"{cur_dir}/examples/…
-                [f"{cur_dir}/examples/…
+                [f"{cur_dir}/examples/ovis2_math0.jpg", "Each face of the polyhedron shown is either a triangle or a square. Each square borders 4 triangles, and each triangle borders 3 squares. The polyhedron has 6 squares. How many triangles does it have?\n\nProvide a step-by-step solution to the problem, and conclude with 'the answer is' followed by the final solution."],
+                [f"{cur_dir}/examples/ovis2_math1.jpg", "A large square touches another two squares, as shown in the picture. The numbers inside the smaller squares indicate their areas. What is the area of the largest square?\n\nProvide a step-by-step solution to the problem, and conclude with 'the answer is' followed by the final solution."],
+                [f"{cur_dir}/examples/ovis2_figure0.png", "Explain this model."],
+                [f"{cur_dir}/examples/ovis2_figure1.png", "Extract the notes about PPO and GRPO in the figure, paying attention to readability."],
+                [f"{cur_dir}/examples/ovis2_multi0.jpg", "Posso avere un frappuccino e un caffè americano di taglia M? Quanto costa in totale?"],
             ],
             inputs=[image_input, text_input]
         )
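The rewritten ovis_chat keeps generation inside `with torch.inference_mode()` and drives the chatbot through a `TextIteratorStreamer`, optionally on a background `Thread` (the `use_thread` flag, which this commit leaves at False). The sketch below isolates that pattern with a small generic Hugging Face causal LM standing in for Ovis2-16B; the model name, prompt, and token budget are placeholder assumptions, not part of the commit.

```python
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Placeholder model for illustration; the Space instead loads 'AIDC-AI/Ovis2-16B'
# with trust_remote_code=True and its own tokenizers.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Same streamer configuration as in the diff: skip the prompt and special tokens.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer("Explain Git LFS in one sentence.", return_tensors="pt")

use_thread = True  # the commit defaults this flag to False

def generate():
    with torch.inference_mode():
        model.generate(**inputs, max_new_tokens=64, streamer=streamer)

if use_thread:
    # Background generation: the streamer can be consumed while tokens are produced.
    thread = Thread(target=generate)
    thread.start()
else:
    # Synchronous generation: all chunks are queued before the loop below runs,
    # so the "stream" is replayed only after generation has finished.
    generate()

response = ""
for new_text in streamer:
    response += new_text
    print(new_text, end="", flush=True)

if use_thread:
    thread.join()
```

With `use_thread = True` the loop consumes tokens as they are produced; with the commit's default of False, `generate()` runs to completion first and the loop drains the already-buffered chunks, so the streamed text only becomes available after generation has finished.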
examples/{case1.png → ovis2_figure0.png} RENAMED
File without changes

examples/ovis2_figure1.png ADDED
Stored with Git LFS

examples/{case0.png → ovis2_math0.jpg} RENAMED
File without changes

examples/{case2.png → ovis2_math1.jpg} RENAMED
File without changes

examples/ovis2_multi0.jpg ADDED
Stored with Git LFS
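For orientation, here is a rough, self-contained sketch of how the renamed example images and the `"display": False` delimiter entry plug into a Gradio interface. It is not the Space's actual layout: the stub chat handler, the component arrangement, and the trimmed example list are assumptions for illustration, and running it assumes the examples/ files from this commit sit next to the script.

```python
import gradio as gr

# Mirrors the delimiter entry set in the app.py hunk at new line 153.
latex_delimiters_set = [
    {"left": "\\(", "right": "\\)", "display": False},
    {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True},
]

def submit_chat(chatbot, text_input):
    # Stub handler standing in for the Space's ovis_chat pipeline.
    chatbot.append((text_input, f"(stub reply to: {text_input})"))
    return chatbot, ""

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(latex_delimiters=latex_delimiters_set)
    text_input = gr.Textbox(label="prompt")
    image_input = gr.Image(label="image", height=350, type="pil")
    gr.Examples(
        examples=[
            ["examples/ovis2_figure0.png", "Explain this model."],
            ["examples/ovis2_multi0.jpg",
             "Posso avere un frappuccino e un caffè americano di taglia M? Quanto costa in totale?"],
        ],
        inputs=[image_input, text_input],
    )
    text_input.submit(submit_chat, [chatbot, text_input], [chatbot, text_input])

if __name__ == "__main__":
    demo.launch()
```

`gr.Chatbot`'s `latex_delimiters` entries take `left`/`right` strings plus a boolean `display` flag, which is what that hunk sets explicitly.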