Spaces: Running on Zero
Commit 9d8246c
Parent(s): 9508399
Refactoring
app.py CHANGED
@@ -8,73 +8,33 @@ import os.path as osp
 import time
 import hashlib
 import argparse
+import shutil
+import re
 import random
 from pathlib import Path
-from typing import List
-
+from typing import List
+import json
 
 import cv2
 import numpy as np
 import torch
 import torch.nn.functional as F
 from PIL import Image, ImageEnhance
+import PIL.Image as PImage
 from torchvision.transforms.functional import to_tensor
 from transformers import AutoTokenizer, T5EncoderModel, T5TokenizerFast
 from huggingface_hub import hf_hub_download
 import gradio as gr
 import spaces
-import json
 
 from models.infinity import Infinity
 from models.basic import *
-from utils.dynamic_resolution import dynamic_resolution_h_w
+from utils.dynamic_resolution import dynamic_resolution_h_w, h_div_w_templates
 from gradio_client import Client
 
-# Performance optimizations
 torch._dynamo.config.cache_size_limit = 64
-torch.backends.cudnn.benchmark = True  # Enable cudnn auto-tuner
 client = Client("Qwen/Qwen2.5-72B-Instruct")
 
-@dataclass
-class ModelConfig:
-    """Configuration for Infinity model."""
-    depth: int
-    embed_dim: int
-    num_heads: int
-    drop_path_rate: float = 0.1
-    mlp_ratio: float = 4.0
-    block_chunks: int = 8
-
-    @classmethod
-    def from_type(cls, model_type: str) -> 'ModelConfig':
-        """Create model config from predefined types."""
-        configs = {
-            'infinity_2b': dict(depth=32, embed_dim=2048, num_heads=2048//128),
-            'infinity_layer12': dict(depth=12, embed_dim=768, num_heads=8),
-            'infinity_layer16': dict(depth=16, embed_dim=1152, num_heads=12),
-            'infinity_layer24': dict(depth=24, embed_dim=1536, num_heads=16),
-            'infinity_layer32': dict(depth=32, embed_dim=2080, num_heads=20),
-            'infinity_layer40': dict(depth=40, embed_dim=2688, num_heads=24),
-            'infinity_layer48': dict(depth=48, embed_dim=3360, num_heads=28),
-        }
-        if model_type not in configs:
-            raise ValueError(f"Unknown model type: {model_type}")
-        return cls(**configs[model_type])
-
-    def to_dict(self) -> Dict:
-        """Convert config to dictionary."""
-        return {
-            'depth': self.depth,
-            'embed_dim': self.embed_dim,
-            'num_heads': self.num_heads,
-            'drop_path_rate': self.drop_path_rate,
-            'mlp_ratio': self.mlp_ratio,
-            'block_chunks': self.block_chunks
-        }
-
-# Global device configuration
-DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
 # Define a function to download weights if not present
 def download_infinity_weights(weights_path):
     try:
@@ -89,19 +49,7 @@ def download_infinity_weights(weights_path):
     except Exception as e:
         print(f"Error downloading weights: {e}")
 
-def extract_key_val(text):
-    pattern = r'<(.+?):(.+?)>'
-    matches = re.findall(pattern, text)
-    key_val = {}
-    for match in matches:
-        key_val[match[0]] = match[1].lstrip()
-    return key_val
-
-def encode_prompt(text_tokenizer, text_encoder, prompt, enable_positive_prompt=False):
-    if enable_positive_prompt:
-        print(f'before positive_prompt aug: {prompt}')
-        prompt = aug_with_positive_prompt(prompt)
-        print(f'after positive_prompt aug: {prompt}')
+def encode_prompt(text_tokenizer, text_encoder, prompt):
     print(f'prompt={prompt}')
     captions = [prompt]
     tokens = text_tokenizer(text=captions, max_length=512, padding='max_length', truncation=True, return_tensors='pt') # todo: put this into dataset
@@ -118,14 +66,6 @@ def encode_prompt(text_tokenizer, text_encoder, prompt, enable_positive_prompt=False):
     text_cond_tuple = (kv_compact, lens, cu_seqlens_k, Ltext)
     return text_cond_tuple
 
-def aug_with_positive_prompt(prompt):
-    for key in ['man', 'woman', 'men', 'women', 'boy', 'girl', 'child', 'person', 'human', 'adult', 'teenager', 'employee',
-                'employer', 'worker', 'mother', 'father', 'sister', 'brother', 'grandmother', 'grandfather', 'son', 'daughter']:
-        if key in prompt:
-            prompt = prompt + '. very smooth faces, good looking faces, face to the camera, perfect facial features'
-            break
-    return prompt
-
 def enhance_image(image):
     for t in range(1):
         contrast_image = image.copy()
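The text_cond_tuple above packs variable-length T5 hidden states into one flat buffer so attention can index each caption by offset. A minimal, self-contained sketch of that packing idea (names mirror the diff; the random tensors are stand-ins for real T5 states):

import torch

def pack_text_states(states_list):
    # states_list: one [L_i, C] tensor of unpadded hidden states per caption
    lens = [s.shape[0] for s in states_list]
    kv_compact = torch.cat(states_list, dim=0)                 # [sum(L_i), C]
    cu_seqlens_k = torch.tensor([0] + lens).cumsum(0)          # prefix offsets per caption
    Ltext = max(lens)
    return kv_compact, lens, cu_seqlens_k, Ltext

kv, lens, cu, L = pack_text_states([torch.randn(5, 8), torch.randn(3, 8)])
print(kv.shape, lens, cu.tolist(), L)  # torch.Size([8, 8]) [5, 3] [0, 5, 8] 5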
@@ -136,6 +76,71 @@ def enhance_image(image):
         color_image = color_enhancer.enhance(1.05)  # enhance saturation
     return color_image
 
+def gen_one_img(
+    infinity_test,
+    vae,
+    text_tokenizer,
+    text_encoder,
+    prompt,
+    cfg_list=[],
+    tau_list=[],
+    negative_prompt='',
+    scale_schedule=None,
+    top_k=900,
+    top_p=0.97,
+    cfg_sc=3,
+    cfg_exp_k=0.0,
+    cfg_insertion_layer=-5,
+    vae_type=0,
+    gumbel=0,
+    softmax_merge_topk=-1,
+    gt_leak=-1,
+    gt_ls_Bl=None,
+    g_seed=None,
+    sampling_per_bits=1,
+):
+    sstt = time.time()
+    if not isinstance(cfg_list, list):
+        cfg_list = [cfg_list] * len(scale_schedule)
+    if not isinstance(tau_list, list):
+        tau_list = [tau_list] * len(scale_schedule)
+    text_cond_tuple = encode_prompt(text_tokenizer, text_encoder, prompt)
+    if negative_prompt:
+        negative_label_B_or_BLT = encode_prompt(text_tokenizer, text_encoder, negative_prompt)
+    else:
+        negative_label_B_or_BLT = None
+    print(f'cfg: {cfg_list}, tau: {tau_list}')
+
+    # Set device if not provided
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+    # Set autocast dtype based on bf16 and device support
+    if device == 'cuda' and torch.cuda.is_bf16_supported():
+        autocast_dtype = torch.bfloat16
+    else:
+        autocast_dtype = torch.float32
+
+    torch.cuda.empty_cache()
+
+    with torch.amp.autocast(device_type=device, dtype=autocast_dtype), torch.no_grad():
+        stt = time.time()
+        _, _, img_list = infinity_test.autoregressive_infer_cfg(
+            vae=vae,
+            scale_schedule=scale_schedule,
+            label_B_or_BLT=text_cond_tuple, g_seed=g_seed,
+            B=1, negative_label_B_or_BLT=negative_label_B_or_BLT, force_gt_Bhw=None,
+            cfg_sc=cfg_sc, cfg_list=cfg_list, tau_list=tau_list, top_k=top_k, top_p=top_p,
+            returns_vemb=1, ratio_Bl1=None, gumbel=gumbel, norm_cfg=False,
+            cfg_exp_k=cfg_exp_k, cfg_insertion_layer=cfg_insertion_layer,
+            vae_type=vae_type, softmax_merge_topk=softmax_merge_topk,
+            ret_img=True, trunk_scale=1000,
+            gt_leak=gt_leak, gt_ls_Bl=gt_ls_Bl, inference_mode=True,
+            sampling_per_bits=sampling_per_bits,
+        )
+    print(f"cost: {time.time() - sstt}, infinity cost={time.time() - stt}")
+    img = img_list[0]
+    return img
+
 def get_prompt_id(prompt):
     md5 = hashlib.md5()
     md5.update(prompt.encode('utf-8'))
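For orientation, this is roughly how the new gen_one_img helper gets invoked (the same wiring generate_image uses later in this diff). A hedged usage sketch, assuming the module-level infinity, vae, text_tokenizer and text_encoder objects that app.py builds; the prompt, seed, and '1M' bucket are illustrative:

h_div_w_template_ = h_div_w_templates[np.argmin(np.abs(h_div_w_templates - 1.0))]
scale_schedule = dynamic_resolution_h_w[h_div_w_template_]['1M']['scales']   # '1M' matches args.pn below
scale_schedule = [(1, h, w) for (_, h, w) in scale_schedule]

img = gen_one_img(
    infinity, vae, text_tokenizer, text_encoder,
    "a watercolor fox in the snow",  # illustrative prompt
    cfg_list=3,                      # scalars are broadcast across all scales
    tau_list=0.5,
    scale_schedule=scale_schedule,
    g_seed=42,
)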
@@ -159,7 +164,7 @@ def load_tokenizer(t5_path =''):
     text_tokenizer: T5TokenizerFast = AutoTokenizer.from_pretrained(t5_path, revision=None, legacy=True)
     text_tokenizer.model_max_length = 512
     text_encoder: T5EncoderModel = T5EncoderModel.from_pretrained(t5_path, torch_dtype=torch.float16)
-    text_encoder.to(DEVICE)
+    text_encoder.to('cuda')
     text_encoder.eval()
     text_encoder.requires_grad_(False)
     return text_tokenizer, text_encoder
@@ -174,6 +179,7 @@ def load_infinity(
     model_path='',
     scale_schedule=None,
     vae=None,
+    device=None,  # Make device optional
     model_kwargs=None,
     text_channels=2048,
     apply_spatial_patchify=0,
@@ -182,8 +188,13 @@ def load_infinity(
 ):
     print(f'[Loading Infinity]')
 
+    # Set device if not provided
+    if device is None:
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    print(f'Using device: {device}')
+
     # Set autocast dtype based on bf16 and device support
-    if bf16 and torch.cuda.is_bf16_supported():
+    if bf16 and device == 'cuda' and torch.cuda.is_bf16_supported():
         autocast_dtype = torch.bfloat16
     else:
         autocast_dtype = torch.float32
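The same availability probe now appears in three places after this commit: in gen_one_img, in load_infinity, and at module level. A self-contained sketch of the pattern factored into one helper (plain PyTorch, not part of the diff):

import torch

def pick_device_and_dtype(prefer_bf16=True):
    # Fall back to CPU/float32 when CUDA or bf16 support is missing
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if prefer_bf16 and device == 'cuda' and torch.cuda.is_bf16_supported():
        return device, torch.bfloat16
    return device, torch.float32

device, autocast_dtype = pick_device_and_dtype()

Centralizing the check this way would keep the three copies in app.py from drifting apart.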
@@ -192,7 +203,7 @@ def load_infinity(
     text_maxlen = 512
     torch.cuda.empty_cache()
 
-    with torch.amp.autocast(device_type=DEVICE.type, dtype=autocast_dtype), torch.no_grad():
+    with torch.amp.autocast(device_type=device, dtype=autocast_dtype), torch.no_grad():
         infinity_test: Infinity = Infinity(
             vae_local=vae, text_channels=text_channels, text_maxlen=text_maxlen,
             shared_aln=True, raw_scale_schedule=scale_schedule,
@@ -210,7 +221,7 @@ def load_infinity(
             inference_mode=True,
             train_h_div_w_list=[1.0],
             **model_kwargs,
-        ).to(DEVICE)
+        ).to(device)
 
     print(f'[you selected Infinity with {model_kwargs=}] model size: {sum(p.numel() for p in infinity_test.parameters())/1e9:.2f}B, bf16={bf16}')
 
@@ -222,11 +233,11 @@ def load_infinity(
     infinity_test.requires_grad_(False)
 
     print(f'[Load Infinity weights]')
-    state_dict = torch.load(model_path, map_location=DEVICE)
+    state_dict = torch.load(model_path, map_location=device)
     print(infinity_test.load_state_dict(state_dict))
 
-    # Initialize random number generator
-    infinity_test.rng = torch.Generator(device=DEVICE)
+    # Initialize random number generator on the correct device
+    infinity_test.rng = torch.Generator(device=device)
 
     return infinity_test
 
@@ -268,7 +279,7 @@ def joint_vi_vae_encode_decode(vae, image_path, scale_schedule, device, tgt_h, tgt_w):
     return gt_img, recons_img, all_bit_indices
 
 def load_visual_tokenizer(args):
-    device = DEVICE
+    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     # load vae
     if args.vae_type in [16,18,20,24,32,64]:
         from models.bsq_vae.vae import vae_model
@@ -311,7 +322,7 @@ def load_transformer(vae, args):
     if not osp.exists(local_model_path):
         print(f'copy {model_path} to {local_model_path}')
         shutil.copyfile(model_path, local_model_path)
-    save_slim_model(local_model_path, save_file=local_slim_model_path, device=DEVICE)
+    save_slim_model(local_model_path, save_file=local_slim_model_path, device=device)
     print(f'copy {local_slim_model_path} to {slim_model_path}')
     if not osp.exists(slim_model_path):
         shutil.copyfile(local_slim_model_path, slim_model_path)
@@ -322,7 +333,20 @@ def load_transformer(vae, args):
         slim_model_path = model_path
     print(f'load checkpoint from {slim_model_path}')
 
-
+    if args.model_type == 'infinity_2b':
+        kwargs_model = dict(depth=32, embed_dim=2048, num_heads=2048//128, drop_path_rate=0.1, mlp_ratio=4, block_chunks=8)  # 2b model
+    elif args.model_type == 'infinity_layer12':
+        kwargs_model = dict(depth=12, embed_dim=768, num_heads=8, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
+    elif args.model_type == 'infinity_layer16':
+        kwargs_model = dict(depth=16, embed_dim=1152, num_heads=12, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
+    elif args.model_type == 'infinity_layer24':
+        kwargs_model = dict(depth=24, embed_dim=1536, num_heads=16, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
+    elif args.model_type == 'infinity_layer32':
+        kwargs_model = dict(depth=32, embed_dim=2080, num_heads=20, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
+    elif args.model_type == 'infinity_layer40':
+        kwargs_model = dict(depth=40, embed_dim=2688, num_heads=24, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
+    elif args.model_type == 'infinity_layer48':
+        kwargs_model = dict(depth=48, embed_dim=3360, num_heads=28, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
     infinity = load_infinity(
         rope2d_each_sa_layer=args.rope2d_each_sa_layer,
         rope2d_normalized_by_hw=args.rope2d_normalized_by_hw,
@@ -333,7 +357,8 @@ def load_transformer(vae, args):
         model_path=slim_model_path,
         scale_schedule=None,
         vae=vae,
-
+        device=None,
+        model_kwargs=kwargs_model,
         text_channels=args.text_channels,
         apply_spatial_patchify=args.apply_spatial_patchify,
         use_flex_attn=args.use_flex_attn,
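The if/elif chain above reintroduces, as inline data, what the deleted ModelConfig.from_type encapsulated. For comparison, an equivalent table-driven sketch; the values are copied from the diff, but the MODEL_KWARGS table itself is not part of the app:

MODEL_KWARGS = {
    'infinity_2b':      dict(depth=32, embed_dim=2048, num_heads=2048//128, drop_path_rate=0.1, mlp_ratio=4, block_chunks=8),
    'infinity_layer12': dict(depth=12, embed_dim=768,  num_heads=8,  drop_path_rate=0.1, mlp_ratio=4, block_chunks=4),
    'infinity_layer16': dict(depth=16, embed_dim=1152, num_heads=12, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4),
    'infinity_layer24': dict(depth=24, embed_dim=1536, num_heads=16, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4),
    'infinity_layer32': dict(depth=32, embed_dim=2080, num_heads=20, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4),
    'infinity_layer40': dict(depth=40, embed_dim=2688, num_heads=24, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4),
    'infinity_layer48': dict(depth=48, embed_dim=3360, num_heads=28, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4),
}
kwargs_model = MODEL_KWARGS[args.model_type]  # same result as the chain, with a KeyError on unknown types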
@@ -400,6 +425,10 @@ weights_path = Path(__file__).parent / 'weights'
 weights_path.mkdir(exist_ok=True)
 download_infinity_weights(weights_path)
 
+# Device setup
+dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
+print(f"Using dtype: {dtype}")
+
 # Define args
 args = argparse.Namespace(
     pn='1M',
@@ -421,7 +450,7 @@ args = argparse.Namespace(
     cache_dir='/dev/shm',
     checkpoint_type='torch',
     seed=0,
-    bf16=1 if torch.cuda.is_bf16_supported() else 0,
+    bf16=1 if dtype == torch.bfloat16 else 0,
     save_file='tmp.jpg',
     enable_model_cache=False,
 )
@@ -433,62 +462,43 @@ infinity = load_transformer(vae, args)
 
 # Define the image generation function
 @spaces.GPU
-def generate_image(prompt, cfg, tau, h_div_w, seed, enable_positive_prompt=False):
-    """Generate an image from a prompt with integrated generation logic."""
+def generate_image(prompt, cfg, tau, h_div_w, seed):
     try:
-
-
-
-
-
-
-        # Calculate image dimensions
-        tgt_h, tgt_w = dynamic_resolution_h_w(h_div_w)
-        scale_schedule = None
+        args.prompt = prompt
+        args.cfg = cfg
+        args.tau = tau
+        args.h_div_w = h_div_w
+        args.seed = seed
 
-        #
-
-
-        # Set up negative prompt if needed
-        negative_prompt = ''
-        if negative_prompt:
-            negative_cond_tuple = encode_prompt(text_tokenizer, text_encoder, negative_prompt)
-            negative_label_B_or_BLT = negative_cond_tuple[0]
-        else:
-            negative_label_B_or_BLT = None
+        # Find the closest h_div_w_template
+        h_div_w_template_ = h_div_w_templates[np.argmin(np.abs(h_div_w_templates - h_div_w))]
 
-
+        # Get scale_schedule based on h_div_w_template_
+        scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['scales']
+        scale_schedule = [(1, h, w) for (_, h, w) in scale_schedule]
 
-        # Generate image
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            gt_leak=0,
-            gt_ls_Bl=None,
-            g_seed=seed,
-            sampling_per_bits=args.sampling_per_bits,
-            scale_schedule=scale_schedule,
-        )
-        print(f'inference time: {time.time()-stt:.3f}s')
+        # Generate the image
+        generated_image = gen_one_img(
+            infinity,
+            vae,
+            text_tokenizer,
+            text_encoder,
+            prompt,
+            g_seed=seed,
+            gt_leak=0,
+            gt_ls_Bl=None,
+            cfg_list=cfg,
+            tau_list=tau,
+            scale_schedule=scale_schedule,
+            cfg_insertion_layer=[args.cfg_insertion_layer],
+            vae_type=args.vae_type,
+            sampling_per_bits=args.sampling_per_bits,
+        )
 
-        # Convert the image
-
-
-
-        image = np.uint8(image)
+        # Convert the image to RGB and uint8
+        image = generated_image.cpu().numpy()
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        image = np.uint8(image)
 
         return image
     except Exception as e:
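The nearest-template lookup in generate_image is plain numpy. A self-contained illustration; the template values below are made up, since the real h_div_w_templates comes from utils.dynamic_resolution:

import numpy as np

h_div_w_templates = np.array([0.5, 0.75, 1.0, 1.333, 2.0])  # assumed values
h_div_w = 1.2
closest = h_div_w_templates[np.argmin(np.abs(h_div_w_templates - h_div_w))]
print(closest)  # 1.333; that bucket's scale schedule then drives generation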
@@ -498,11 +508,6 @@ def generate_image(prompt, cfg, tau, h_div_w, seed, enable_positive_prompt=False):
 # Set up Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("<h1><center>Infinity Image Generator</center></h1>")
-    gr.Markdown("### Instructions")
-    gr.Markdown("1. Enter a prompt in the **Prompt Settings** section.")
-    gr.Markdown("2. Click the **Enhance Prompt** button to generate a more creative and detailed prompt.")
-    gr.Markdown("3. Adjust the **Image Settings** as desired.")
-    gr.Markdown("4. Click the **Generate Image** button to generate the image on the right.")
 
     with gr.Row():
         with gr.Column():
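The diff's context stops before the event wiring, so for orientation here is a hedged sketch of how generate_image is typically bound inside a gr.Blocks layout like this one. Every component name below is an assumption; only generate_image's new five-argument signature comes from the commit:

import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("<h1><center>Infinity Image Generator</center></h1>")
    with gr.Row():
        with gr.Column():
            prompt = gr.Textbox(label="Prompt")
            cfg = gr.Slider(1, 10, value=3, label="CFG scale")
            tau = gr.Slider(0.1, 1.0, value=0.5, label="Tau")
            h_div_w = gr.Slider(0.5, 2.0, value=1.0, label="Aspect ratio (h/w)")
            seed = gr.Number(value=0, precision=0, label="Seed")
            generate_btn = gr.Button("Generate Image")
        with gr.Column():
            output = gr.Image(label="Generated Image")
    # Route the five inputs straight into generate_image and show the result
    generate_btn.click(generate_image, inputs=[prompt, cfg, tau, h_div_w, seed], outputs=output)

demo.launch()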