praeclarumjj3 committed
Commit cce2948 · verified · 1 Parent(s): a2dab9e

Update app.py

Files changed (1):
  1. app.py +14 -7

app.py CHANGED
@@ -365,10 +365,17 @@ txt = gr.Textbox(
 )
 
 
+
 title = "<h1 style='margin-bottom: -10px; text-align: center'>OLA-VLM: Elevating Visual Perception in Multimodal LLMs with Auxiliary Embedding Distillation</h1>"
 description = "<p style='font-size: 16px; margin: 5px; font-weight: w300; text-align: center'> <a href='https://praeclarumjj3.github.io/' style='text-decoration:none' target='_blank'>Jitesh Jain</a> &nbsp;&nbsp; <a href='https://zyang-ur.github.io/' style='text-decoration:none' target='_blank'>Zhengyuan Yang</a> &nbsp;&nbsp; <a href='https://www.humphreyshi.com/home' style='text-decoration:none' target='_blank'>Humphrey Shi<sup>*</sup></a> &nbsp;&nbsp; <a href='https://www.humphreyshi.com/home' style='text-decoration:none' target='_blank'>Jianfeng Gao<sup>*</sup></a> &nbsp;&nbsp; <a href='https://jwyang.github.io/' style='text-decoration:none' target='_blank'>Jianwei Yang<sup>*</sup></a></p>" \
     + "<p style='font-size: 12px; margin: 5px; font-weight: w300; text-align: center'><sup>*</sup>Equal Advising</p>" \
-    + "<p style='font-size: 16px; margin: 5px; font-weight: w600; text-align: center'> <a href='https://praeclarumjj3.github.io/ola_vlm/' target='_blank'>Project Page</a> | <a href='https://youtu.be/' target='_blank'>Video</a> | <a href='https://arxiv.org/abs/' target='_blank'>ArXiv</a> | <a href='https://github.com/SHI-Labs/OLA-VLM' target='_blank'>Github</a></p>"
+    + "<p style='font-size: 16px; margin: 5px; font-weight: w600; text-align: center'> <a href='https://praeclarumjj3.github.io/ola_vlm/' target='_blank'>Project Page</a> | <a href='https://youtu.be/' target='_blank'>Video</a> | <a href='https://arxiv.org/abs/' target='_blank'>ArXiv</a> | <a href='https://github.com/SHI-Labs/OLA-VLM' target='_blank'>Github</a></p>" \
+    + "<p style='text-align: center; font-size: 16px; margin: 5px; font-weight: w300;'>OLA-VLM introduces a new approach to distilling vision knowledge into the hidden representations of LLMs, utilizing target visual representations to advance visual perception in multimodal LLMs. In the demo, along with the standard VQA setting, you can also visualize the intermediate representations from selected layers of OLA-VLM by clicking the <code>✨ Visualize</code> button!</p>" \
+    + "<ul style='text-align: center; font-size: 16px; margin: 5px; font-weight: w300; list-style-type: none; padding: 0;'> \
+    <li><b>depth</b>: Visualizes the depth information in the representations using the decoder from the <a href='https://github.com/DepthAnything/Depth-Anything-V2' target='_blank'>Depth-Anything-v2 model</a>.</li> \
+    <li><b>seg</b>: Visualizes the segmentation information in the representations using the decoder from the <a href='https://github.com/SHI-Labs/OneFormer' target='_blank'>OneFormer model</a>.</li> \
+    <li><b>gen</b>: Visualizes the general information in the representations using <a href='https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip' target='_blank'>SD-2.1-unCLIP</a>. Note that we use the representations as a condition to the model, so the output is an image variation due to the nature of unCLIP.</li> \
+    </ul>"
 
 tos_markdown = ("""
 ### Terms of use
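
The gen bullet above promises an image-variation output because the intermediate representations are fed to SD-2.1-unCLIP as a conditioning embedding rather than as pixels. For reference, a minimal sketch of that conditioning path, assuming a hypothetical `probe` linear projection from an LLM hidden state (the 4096-d shape is a stand-in) to the 1024-d CLIP image-embedding space the unCLIP pipeline expects; OLA-VLM's actual learned probes are not part of this diff:

```python
# Sketch only: `probe` and the hidden-state shape are stand-ins, not OLA-VLM's code.
import torch
from diffusers import StableUnCLIPImg2ImgPipeline

pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16
).to("cuda")

# Stand-in for a layer-k LLM hidden state and a learned projection head.
hidden_state = torch.randn(1, 4096, device="cuda", dtype=torch.float16)
probe = torch.nn.Linear(4096, 1024).to("cuda", torch.float16)  # hypothetical projection
image_embeds = probe(hidden_state)

# Conditioning on `image_embeds` instead of an input image is why the "gen"
# output is an image variation rather than a reconstruction.
gen_img = pipe(prompt="", image_embeds=image_embeds).images[0]
gen_img.save("gen.png")
```

The pipeline also noise-augments the conditioning embedding (its `noise_level` argument, 0 by default), which adds further variation on top of the embedding-only conditioning.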
@@ -427,12 +434,12 @@ with gr.Blocks(title="OLA-VLM", theme=gr.themes.Default(), css=block_css) as demo
         clear_btn = gr.Button(value="🗑️ Clear", interactive=False)
         submit_btn = gr.Button(value="Send", variant="primary")
 
-        with gr.Accordion("Representations from selected layers of the LLM (expects only a single image input)", open=False) as interm_out:
-            inter_vis_btn = gr.Button(value="✨ Visualize")
-            with gr.Row():
-                depth_box = gr.Image(label="depth", type="pil", visible=True)
-                seg_box = gr.Image(label="seg", type="pil", visible=True)
-                gen_box = gr.Image(label="gen", type="pil", visible=True)
+        # with gr.Accordion("Representations from selected layers of the LLM (expects only a single image input)", open=False) as interm_out:
+        inter_vis_btn = gr.Button(value="✨ Visualize")
+        with gr.Row():
+            depth_box = gr.Image(label="depth", type="pil", visible=True)
+            seg_box = gr.Image(label="seg", type="pil", visible=True)
+            gen_box = gr.Image(label="gen", type="pil", visible=True)
 
         gr.Examples(examples=[
             [f"assets/cars.jpg", "Which car is in front: the blue or the brown one?"],