upload app (#2)
Commit: 10eb745bd5ef5e8abf5d771a79ef2732cd570ecd

Files changed:
- .gitattributes    +2 -0
- app.py            +161 -0
- images/1.png      +3 -0
- images/2.jpg      +0 -0
- images/3.jpg      +3 -0
- images/4.png      +0 -0
- requirements.txt  +16 -0
.gitattributes CHANGED

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+images/1.png filter=lfs diff=lfs merge=lfs -text
+images/3.jpg filter=lfs diff=lfs merge=lfs -text
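The two added lines follow the standard pattern written by `git lfs track`: each tells Git to store the matching file with Git LFS (filter=lfs diff=lfs merge=lfs) and to treat it as binary (-text). For example, running

    git lfs track "images/1.png"

before committing the PNG appends exactly such an entry to .gitattributes.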
app.py ADDED

@@ -0,0 +1,161 @@
import os
import time
import threading
import gradio as gr
import spaces
import torch
from PIL import Image
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    TextIteratorStreamer,
)
from transformers.image_utils import load_image

# Constants for text generation
MAX_MAX_NEW_TOKENS = 4096
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load LFM2-VL-1.6B
MODEL_ID_M = "LiquidAI/LFM2-VL-1.6B"
processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
model_m = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID_M,
    trust_remote_code=True,
    torch_dtype="bfloat16",
).to(device).eval()

# Load LFM2-VL-450M
MODEL_ID_T = "LiquidAI/LFM2-VL-450M"
processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
model_t = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID_T,
    trust_remote_code=True,
    torch_dtype="bfloat16",
).to(device).eval()

@spaces.GPU
def generate_image(model_name: str, text: str, image: Image.Image,
                   max_new_tokens: int = 1024,
                   temperature: float = 0.6,
                   top_p: float = 0.9,
                   top_k: int = 50,
                   repetition_penalty: float = 1.2):
    """
    Generate a streamed response from the selected model for an image + text query.
    """
    if model_name == "LFM2-VL-1.6B":
        processor = processor_m
        model = model_m
    elif model_name == "LFM2-VL-450M":
        processor = processor_t
        model = model_t
    else:
        yield "Invalid model selected.", "Invalid model selected."
        return

    if image is None:
        yield "Please upload an image.", "Please upload an image."
        return

    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text},
        ]
    }]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        truncation=False,
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    # Pass the UI sampling controls through to generate(); without these,
    # the temperature/top-p/top-k/repetition-penalty sliders would have no effect.
    generation_kwargs = {
        **inputs, "streamer": streamer, "max_new_tokens": max_new_tokens,
        "do_sample": True, "temperature": temperature, "top_p": top_p,
        "top_k": top_k, "repetition_penalty": repetition_penalty,
    }
    # Run generation on a background thread and stream partial output.
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        yield buffer, buffer

# Define examples for image inference
image_examples = [
    ["According to this diagram, where do severe droughts occur?", "images/1.png"],
    ["Could you describe this image?", "images/2.jpg"],
    ["Provide a description of this image.", "images/3.jpg"],
    ["Explain the movie shot in detail.", "images/4.png"],
]

# CSS for the submit button and output panel
css = """
.submit-btn {
    background-color: #2980b9 !important;
    color: white !important;
}
.submit-btn:hover {
    background-color: #3498db !important;
}
.canvas-output {
    border: 2px solid #4682B4;
    border-radius: 10px;
    padding: 20px;
}
"""

# Create the Gradio interface
with gr.Blocks(css=css, theme=gr.themes.Soft()) as demo:
    gr.Markdown("# **LFM2-VL by [LiquidAI](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa)**")
    with gr.Row():
        with gr.Column():
            image_query = gr.Textbox(label="Query Input", placeholder="✦︎ Enter your query")
            image_upload = gr.Image(type="pil", label="Image")
            image_submit = gr.Button("Submit", elem_classes="submit-btn")
            gr.Examples(
                examples=image_examples,
                inputs=[image_query, image_upload]
            )

            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)

        with gr.Column():
            with gr.Column(elem_classes="canvas-output"):
                gr.Markdown("## Output")
                output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
                with gr.Accordion("(Result.md)", open=False):
                    markdown_output = gr.Markdown(label="(Result.md)")

            model_choice = gr.Dropdown(
                choices=["LFM2-VL-1.6B", "LFM2-VL-450M"],
                label="Select Model",
                value="LFM2-VL-1.6B"
            )

            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/LFM2-VL-Demo/discussions)")
            gr.Markdown("> [LFM2-VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) is [Liquid AI's](https://huggingface.co/LiquidAI) first multimodal model series, with 450M- and 1.6B-parameter variants designed for efficient processing of text and images at native resolutions up to 512×512, well suited to low-latency edge AI applications. Built on a hybrid conv+attention LFM2 backbone with SigLIP2 NaFlex vision encoders, it delivers flexible, user-tunable inference at roughly 2× the speed of comparable VLMs on GPU.")
            gr.Markdown("> It offers competitive accuracy and dynamic image tokenization for scalable throughput, supports up to 32,768 text tokens with English-language generation, and is best adapted to targeted use cases with the provided supervised fine-tuning tools. The models are released under the LFM Open License v1.0 for research and deployment scenarios that do not require safety-critical guarantees.")

    # Wire the submit button to the streaming generator
    image_submit.click(fn=generate_image,
                       inputs=[
                           model_choice, image_query, image_upload,
                           max_new_tokens, temperature, top_p, top_k,
                           repetition_penalty
                       ],
                       outputs=[output, markdown_output])

if __name__ == "__main__":
    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
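app.py streams tokens by running model.generate on a background thread and draining a TextIteratorStreamer into the two output components. For reference, here is a minimal non-streaming sketch of the same inference path, using the same checkpoints and processor calls as above but no Gradio or ZeroGPU; it assumes network access to download the model and enough memory to hold it:

    import torch
    from PIL import Image
    from transformers import AutoModelForImageTextToText, AutoProcessor

    MODEL_ID = "LiquidAI/LFM2-VL-450M"  # the smaller of the two variants used above
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID, trust_remote_code=True, torch_dtype="bfloat16"
    ).to(device).eval()

    image = Image.open("images/2.jpg")  # any local image works here
    messages = [{"role": "user", "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": "Could you describe this image?"},
    ]}]
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt], images=[image], return_tensors="pt").to(device)
    output_ids = model.generate(**inputs, max_new_tokens=256)
    print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])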
images/1.png ADDED (Git LFS)
images/2.jpg ADDED
images/3.jpg ADDED (Git LFS)
images/4.png ADDED
requirements.txt ADDED

@@ -0,0 +1,16 @@
av
peft
torch
spaces
gradio
pillow
requests
accelerate
safetensors
torchvision
transformers
huggingface_hub
opencv-python
sentencepiece
qwen-vl-utils
transformers-stream-generator
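To run the demo outside the Space, the usual steps apply (a recent Gradio release with MCP support is assumed, since launch() is called with mcp_server=True):

    pip install -r requirements.txt
    python app.py

demo.queue(max_size=50) bounds the number of concurrently queued requests, and share=True additionally publishes a temporary public URL when running locally.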