Spaces:
				
			
			
	
			
			
		Running
		
			on 
			
			Zero
	
	
	
			
			
	
	
	
	
		
		
		Running
		
			on 
			
			Zero
	Update app.py
Browse files
    	
        app.py
    CHANGED
    
    | @@ -365,10 +365,17 @@ txt = gr.Textbox( | |
| 365 | 
             
            )
         | 
| 366 |  | 
| 367 |  | 
|  | |
| 368 | 
             
            title = "<h1 style='margin-bottom: -10px; text-align: center'>OLA-VLM: Elevating Visual Perception in Multimodal LLMs with Auxiliary Embedding Distillation</h1>"
         | 
| 369 | 
             
            description = "<p style='font-size: 16px; margin: 5px; font-weight: w300; text-align: center'> <a href='https://praeclarumjj3.github.io/' style='text-decoration:none' target='_blank'>Jitesh Jain</a>    <a href='https://zyang-ur.github.io/' style='text-decoration:none' target='_blank'>Zhengyuan Yang</a>    <a href='https://www.humphreyshi.com/home' style='text-decoration:none' target='_blank'>Humphrey Shi<sup>*</sup></a>    <a href='https://www.humphreyshi.com/home' style='text-decoration:none' target='_blank'>Jianfeng Gao<sup>*</sup></a>    <a href='https://jwyang.github.io/' style='text-decoration:none' target='_blank'>Jianwei Yang<sup>*</sup></a></p>" \
         | 
| 370 | 
             
                        + "<p style='font-size: 12px; margin: 5px; font-weight: w300; text-align: center'><sup>*</sup>Equal Advising</p>" \
         | 
| 371 | 
            -
                        + "<p style='font-size: 16px; margin: 5px; font-weight: w600; text-align: center'> <a href='https://praeclarumjj3.github.io/ola_vlm/' target='_blank'>Project Page</a> | <a href='https://youtu.be/' target='_blank'>Video</a> | <a href='https://arxiv.org/abs/' target='_blank'>ArXiv</a> | <a href='https://github.com/SHI-Labs/OLA-VLM' target='_blank'>Github</a></p>"
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 372 |  | 
| 373 | 
             
            tos_markdown = ("""
         | 
| 374 | 
             
            ### Terms of use
         | 
| @@ -427,12 +434,12 @@ with gr.Blocks(title="OLA-VLM", theme=gr.themes.Default(), css=block_css) as dem | |
| 427 | 
             
                            clear_btn = gr.Button(value="🗑️  Clear", interactive=False)
         | 
| 428 | 
             
                            submit_btn = gr.Button(value="Send", variant="primary")
         | 
| 429 |  | 
| 430 | 
            -
                with gr.Accordion("Representations from selected layers of the LLM (expects only a single image input)", open=False) as interm_out:
         | 
| 431 | 
            -
             | 
| 432 | 
            -
             | 
| 433 | 
            -
             | 
| 434 | 
            -
             | 
| 435 | 
            -
             | 
| 436 |  | 
| 437 | 
             
                gr.Examples(examples=[
         | 
| 438 | 
             
                        [f"assets/cars.jpg", "Which car is in front: the blue or the brown one?"],
         | 
|  | |
| 365 | 
             
            )
         | 
| 366 |  | 
| 367 |  | 
| 368 | 
            +
             | 
| 369 | 
             
            title = "<h1 style='margin-bottom: -10px; text-align: center'>OLA-VLM: Elevating Visual Perception in Multimodal LLMs with Auxiliary Embedding Distillation</h1>"
         | 
| 370 | 
             
            description = "<p style='font-size: 16px; margin: 5px; font-weight: w300; text-align: center'> <a href='https://praeclarumjj3.github.io/' style='text-decoration:none' target='_blank'>Jitesh Jain</a>    <a href='https://zyang-ur.github.io/' style='text-decoration:none' target='_blank'>Zhengyuan Yang</a>    <a href='https://www.humphreyshi.com/home' style='text-decoration:none' target='_blank'>Humphrey Shi<sup>*</sup></a>    <a href='https://www.humphreyshi.com/home' style='text-decoration:none' target='_blank'>Jianfeng Gao<sup>*</sup></a>    <a href='https://jwyang.github.io/' style='text-decoration:none' target='_blank'>Jianwei Yang<sup>*</sup></a></p>" \
         | 
| 371 | 
             
                        + "<p style='font-size: 12px; margin: 5px; font-weight: w300; text-align: center'><sup>*</sup>Equal Advising</p>" \
         | 
| 372 | 
            +
                        + "<p style='font-size: 16px; margin: 5px; font-weight: w600; text-align: center'> <a href='https://praeclarumjj3.github.io/ola_vlm/' target='_blank'>Project Page</a> | <a href='https://youtu.be/' target='_blank'>Video</a> | <a href='https://arxiv.org/abs/' target='_blank'>ArXiv</a> | <a href='https://github.com/SHI-Labs/OLA-VLM' target='_blank'>Github</a></p>" \
         | 
| 373 | 
            +
                        + "<p style='text-align: center; font-size: 16px; margin: 5px; font-weight: w300;'>OLA-VLM introduces a new approach to distilling vision knowledge into the hidden representations of LLMs, utilizing target visual representations to advance visual perception in multimodal LLMs. In the demo, along with the standard VQA setting, you can also visualize the intermediate representations from selected layers in OLA-VLM by clicking on the <code>✨ Visualize</code> button!</p>" \
         | 
| 374 | 
            +
                        + "<ul style='text-align: center; font-size: 16px; margin: 5px; font-weight: w300; list-style-type: none; padding: 0;'> \
         | 
| 375 | 
            +
                             <li><b>depth</b>: Visualizes the depth information in the representations using the decoder from the <a href='https://github.com/DepthAnything/Depth-Anything-V2' target='_blank'>Depth-Anything-v2 model</a>.</li> \
         | 
| 376 | 
            +
                             <li><b>seg</b>: Visualizes the segmentation information in the representations using the decoder from the <a href='https://github.com/SHI-Labs/OneFormer' target='_blank'>OneFormer model</a>.</li> \
         | 
| 377 | 
            +
                             <li><b>gen</b>: Visualizes the general information of the representations using the <a href='https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip' target='_blank'>SD-2.1-unCLIP</a>. Note that we use representations as a condition to the model, resulting in an image variation output due to the nature of unCLIP.</li> \
         | 
| 378 | 
            +
                        </ul>"
         | 
| 379 |  | 
| 380 | 
             
            tos_markdown = ("""
         | 
| 381 | 
             
            ### Terms of use
         | 
|  | |
| 434 | 
             
                            clear_btn = gr.Button(value="🗑️  Clear", interactive=False)
         | 
| 435 | 
             
                            submit_btn = gr.Button(value="Send", variant="primary")
         | 
| 436 |  | 
| 437 | 
            +
                # with gr.Accordion("Representations from selected layers of the LLM (expects only a single image input)", open=False) as interm_out:
         | 
| 438 | 
            +
                inter_vis_btn = gr.Button(value="✨ Visualize")
         | 
| 439 | 
            +
                with gr.Row():
         | 
| 440 | 
            +
                    depth_box = gr.Image(label="depth", type="pil", visible=True)
         | 
| 441 | 
            +
                    seg_box = gr.Image(label="seg", type="pil", visible=True)
         | 
| 442 | 
            +
                    gen_box = gr.Image(label="gen", type="pil", visible=True)
         | 
| 443 |  | 
| 444 | 
             
                gr.Examples(examples=[
         | 
| 445 | 
             
                        [f"assets/cars.jpg", "Which car is in front: the blue or the brown one?"],
         | 

