Spaces:

alexnasa
/

SuperResolution

Running on Zero

App Files Files Community

alexnasa committed on Jul 12

Commit

a2bb6b2

verified ·

1 Parent(s): 6846357

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -0

app.py CHANGED Viewed

@@ -9,6 +9,9 @@ import numpy as np
 from PIL import Image
 import torch
 print(f'torch version:{torch.__version__}')
@@ -47,6 +50,69 @@ from torchvision import transforms
 from models.controlnet import ControlNetModel
 from models.unet_2d_condition import UNet2DConditionModel
 tensor_transforms = transforms.Compose([
                 transforms.ToTensor(),
             ])

 from PIL import Image
 import torch
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info
 print(f'torch version:{torch.__version__}')
 from models.controlnet import ControlNetModel
 from models.unet_2d_condition import UNet2DConditionModel
+# Hub identifier of the vision-language model that _generate_vlm_prompt uses
+# to caption images.
+VLM_NAME  = "Qwen/Qwen2.5-VL-3B-Instruct"
+# The model is instantiated at module import, so the weights are loaded once
+# and shared by every later call rather than being reloaded per request.
+vlm_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    VLM_NAME,
+    torch_dtype="auto",
+    device_map="auto"   # immediately dispatches layers onto available GPUs
+)
+# Processor loaded from the same checkpoint name as the model; it supplies the
+# chat template, input tensor construction and token decoding used below.
+vlm_processor = AutoProcessor.from_pretrained(VLM_NAME)
+def _generate_vlm_prompt(
+    vlm_model: Qwen2_5_VLForConditionalGeneration,
+    vlm_processor: AutoProcessor,
+    process_vision_info,
+    pil_image: Image.Image,
+    device: str = "cuda"
+) -> str:
+    """
+    Given two PIL.Image inputs:
+      - prev_pil:   the “full” image at the previous recursion.
+      - zoomed_pil: the cropped+resized (zoom) image for this step.
+    Returns a single “recursive_multiscale” prompt string.
+    """
+    message_text = (
+        "The give a detailed description of this image as a caption."
+    )
+    messages = [
+        {"role": "system", "content": message_text},
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": pil_image},
+            ],
+        },
+    ]
+    text = vlm_processor.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = vlm_processor(
+        text=[text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    ).to(device)
+    generated = vlm_model.generate(**inputs, max_new_tokens=128)
+    trimmed = [
+        out_ids[len(in_ids):]
+        for in_ids, out_ids in zip(inputs.input_ids, generated)
+    ]
+    out_text = vlm_processor.batch_decode(
+        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+    return out_text.strip()
 tensor_transforms = transforms.Compose([
                 transforms.ToTensor(),
             ])