MohamedRashad committed
Commit 0e3e704 · 1 Parent(s): 1776f2c

Refactoring

Files changed (1):
  1. app.py +112 -135
app.py CHANGED
@@ -8,19 +8,16 @@ import os.path as osp
 import time
 import hashlib
 import argparse
 import shutil
- import re
 import random
 from pathlib import Path
- from typing import List
- import json
+ from typing import List, Dict, Optional
+ from dataclasses import dataclass

 import cv2
 import numpy as np
 import torch
 import torch.nn.functional as F
 from PIL import Image, ImageEnhance
- import PIL.Image as PImage
 from torchvision.transforms.functional import to_tensor
 from transformers import AutoTokenizer, T5EncoderModel, T5TokenizerFast
 from huggingface_hub import hf_hub_download
@@ -29,12 +26,54 @@ import spaces

 from models.infinity import Infinity
 from models.basic import *
 from utils.dynamic_resolution import dynamic_resolution_h_w, h_div_w_templates
 from gradio_client import Client

+ # Performance optimizations
 torch._dynamo.config.cache_size_limit = 64
+ torch.backends.cudnn.benchmark = True  # Enable cudnn auto-tuner
 client = Client("Qwen/Qwen2.5-72B-Instruct")

+ @dataclass
+ class ModelConfig:
+     """Configuration for Infinity model."""
+     depth: int
+     embed_dim: int
+     num_heads: int
+     drop_path_rate: float = 0.1
+     mlp_ratio: float = 4.0
+     block_chunks: int = 8
+
+     @classmethod
+     def from_type(cls, model_type: str) -> 'ModelConfig':
+         """Create model config from predefined types."""
+         configs = {
+             'infinity_2b': dict(depth=32, embed_dim=2048, num_heads=2048//128, block_chunks=8),
+             'infinity_layer12': dict(depth=12, embed_dim=768, num_heads=8, block_chunks=4),
+             'infinity_layer16': dict(depth=16, embed_dim=1152, num_heads=12, block_chunks=4),
+             'infinity_layer24': dict(depth=24, embed_dim=1536, num_heads=16, block_chunks=4),
+             'infinity_layer32': dict(depth=32, embed_dim=2080, num_heads=20, block_chunks=4),
+             'infinity_layer40': dict(depth=40, embed_dim=2688, num_heads=24, block_chunks=4),
+             'infinity_layer48': dict(depth=48, embed_dim=3360, num_heads=28, block_chunks=4),
+         }
+         if model_type not in configs:
+             raise ValueError(f"Unknown model type: {model_type}")
+         return cls(**configs[model_type])
+
+     def to_dict(self) -> Dict:
+         """Convert config to dictionary."""
+         return {
+             'depth': self.depth,
+             'embed_dim': self.embed_dim,
+             'num_heads': self.num_heads,
+             'drop_path_rate': self.drop_path_rate,
+             'mlp_ratio': self.mlp_ratio,
+             'block_chunks': self.block_chunks,
+         }
+
+ # Global device configuration
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
 # Define a function to download weights if not present
 def download_infinity_weights(weights_path):
     try:
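A quick usage sketch of the new ModelConfig helper (illustrative only, not part of the commit; the printed values follow the 'infinity_2b' entry above):

    config = ModelConfig.from_type('infinity_2b')
    kwargs = config.to_dict()
    print(kwargs['depth'], kwargs['num_heads'], kwargs['block_chunks'])  # 32 16 8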
@@ -96,60 +135,6 @@ def enhance_image(image):
     color_image = color_enhancer.enhance(1.05)  # enhance saturation
     return color_image

- def gen_one_img(
-     infinity_test,
-     vae,
-     text_tokenizer,
-     text_encoder,
-     prompt,
-     cfg_list=[],
-     tau_list=[],
-     negative_prompt='',
-     scale_schedule=None,
-     top_k=900,
-     top_p=0.97,
-     cfg_sc=3,
-     cfg_exp_k=0.0,
-     cfg_insertion_layer=-5,
-     vae_type=0,
-     gumbel=0,
-     softmax_merge_topk=-1,
-     gt_leak=-1,
-     gt_ls_Bl=None,
-     g_seed=None,
-     sampling_per_bits=1,
-     enable_positive_prompt=0,
- ):
-     sstt = time.time()
-     if not isinstance(cfg_list, list):
-         cfg_list = [cfg_list] * len(scale_schedule)
-     if not isinstance(tau_list, list):
-         tau_list = [tau_list] * len(scale_schedule)
-     text_cond_tuple = encode_prompt(text_tokenizer, text_encoder, prompt, enable_positive_prompt)
-     if negative_prompt:
-         negative_label_B_or_BLT = encode_prompt(text_tokenizer, text_encoder, negative_prompt)
-     else:
-         negative_label_B_or_BLT = None
-     print(f'cfg: {cfg_list}, tau: {tau_list}')
-     with torch.cuda.amp.autocast(enabled=True, dtype=torch.bfloat16, cache_enabled=True):
-         stt = time.time()
-         _, _, img_list = infinity_test.autoregressive_infer_cfg(
-             vae=vae,
-             scale_schedule=scale_schedule,
-             label_B_or_BLT=text_cond_tuple, g_seed=g_seed,
-             B=1, negative_label_B_or_BLT=negative_label_B_or_BLT, force_gt_Bhw=None,
-             cfg_sc=cfg_sc, cfg_list=cfg_list, tau_list=tau_list, top_k=top_k, top_p=top_p,
-             returns_vemb=1, ratio_Bl1=None, gumbel=gumbel, norm_cfg=False,
-             cfg_exp_k=cfg_exp_k, cfg_insertion_layer=cfg_insertion_layer,
-             vae_type=vae_type, softmax_merge_topk=softmax_merge_topk,
-             ret_img=True, trunk_scale=1000,
-             gt_leak=gt_leak, gt_ls_Bl=gt_ls_Bl, inference_mode=True,
-             sampling_per_bits=sampling_per_bits,
-         )
-     print(f"cost: {time.time() - sstt}, infinity cost={time.time() - stt}")
-     img = img_list[0]
-     return img
-
 def get_prompt_id(prompt):
     md5 = hashlib.md5()
     md5.update(prompt.encode('utf-8'))
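get_prompt_id hashes the UTF-8 prompt with MD5 (the function body is truncated by the hunk; it presumably returns the hex digest). A standalone equivalent, for illustration:

    import hashlib
    prompt_id = hashlib.md5('a photo of a cat'.encode('utf-8')).hexdigest()
    print(prompt_id)  # 32-character hex string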
@@ -173,7 +158,7 @@ def load_tokenizer(t5_path =''):
     text_tokenizer: T5TokenizerFast = AutoTokenizer.from_pretrained(t5_path, revision=None, legacy=True)
     text_tokenizer.model_max_length = 512
     text_encoder: T5EncoderModel = T5EncoderModel.from_pretrained(t5_path, torch_dtype=torch.float16)
-     text_encoder.to('cuda')
+     text_encoder.to(DEVICE)
     text_encoder.eval()
     text_encoder.requires_grad_(False)
     return text_tokenizer, text_encoder
@@ -188,7 +173,6 @@ def load_infinity(
     model_path='',
     scale_schedule=None,
     vae=None,
-     device=None, # Make device optional
     model_kwargs=None,
     text_channels=2048,
     apply_spatial_patchify=0,
@@ -197,13 +181,8 @@
 ):
     print(f'[Loading Infinity]')

-     # Set device if not provided
-     if device is None:
-         device = 'cuda' if torch.cuda.is_available() else 'cpu'
-         print(f'Using device: {device}')
-
     # Set autocast dtype based on bf16 and device support
-     if bf16 and device == 'cuda' and torch.cuda.is_bf16_supported():
+     if bf16 and DEVICE.type == 'cuda' and torch.cuda.is_bf16_supported():
         autocast_dtype = torch.bfloat16
     else:
         autocast_dtype = torch.float32
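The capability-gated dtype selection above works as a standalone pattern; a minimal sketch (illustrative, not from the commit):

    import torch
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    dtype = torch.bfloat16 if device.type == 'cuda' and torch.cuda.is_bf16_supported() else torch.float32
    # Autocast is only enabled when a reduced-precision dtype was selected
    with torch.amp.autocast(device_type=device.type, dtype=dtype, enabled=dtype != torch.float32):
        pass  # model construction / inference would run here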
@@ -212,7 +191,7 @@
     text_maxlen = 512
     torch.cuda.empty_cache()

-     with torch.amp.autocast(device_type=device, dtype=autocast_dtype), torch.no_grad():
+     with torch.amp.autocast(device_type=DEVICE.type, dtype=autocast_dtype), torch.no_grad():
         infinity_test: Infinity = Infinity(
             vae_local=vae, text_channels=text_channels, text_maxlen=text_maxlen,
             shared_aln=True, raw_scale_schedule=scale_schedule,
@@ -230,7 +209,7 @@
             inference_mode=True,
             train_h_div_w_list=[1.0],
             **model_kwargs,
-         ).to(device)
+         ).to(DEVICE)

     print(f'[you selected Infinity with {model_kwargs=}] model size: {sum(p.numel() for p in infinity_test.parameters())/1e9:.2f}B, bf16={bf16}')

@@ -242,17 +221,11 @@ def load_infinity(
     infinity_test.requires_grad_(False)

     print(f'[Load Infinity weights]')
-     state_dict = torch.load(model_path, map_location=device)
+     state_dict = torch.load(model_path, map_location=DEVICE)
     print(infinity_test.load_state_dict(state_dict))

-     # Initialize random number generator, falling back to CPU if CUDA is not available
-     try:
-         infinity_test.rng = torch.Generator(device=device)
-     except RuntimeError:
-         print("CUDA device not available. Falling back to CPU...")
-         device = 'cpu'
-         infinity_test = infinity_test.to(device)
-         infinity_test.rng = torch.Generator(device=device)
+     # Initialize random number generator
+     infinity_test.rng = torch.Generator(device=DEVICE)

     return infinity_test
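A torch.Generator pinned to a device gives reproducible sampling without touching the global RNG; minimal sketch (illustrative):

    import torch
    g = torch.Generator(device='cpu')  # or the model's device
    g.manual_seed(0)
    noise = torch.randn(2, 3, generator=g)  # identical values on every run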
@@ -294,7 +267,7 @@ def joint_vi_vae_encode_decode(vae, image_path, scale_schedule, device, tgt_h, tgt_w):
     return gt_img, recons_img, all_bit_indices

 def load_visual_tokenizer(args):
-     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+     device = DEVICE
     # load vae
     if args.vae_type in [16,18,20,24,32,64]:
         from models.bsq_vae.vae import vae_model
@@ -337,7 +310,7 @@ def load_transformer(vae, args):
     if not osp.exists(local_model_path):
         print(f'copy {model_path} to {local_model_path}')
         shutil.copyfile(model_path, local_model_path)
-     save_slim_model(local_model_path, save_file=local_slim_model_path, device=device)
+     save_slim_model(local_model_path, save_file=local_slim_model_path, device=DEVICE)
     print(f'copy {local_slim_model_path} to {slim_model_path}')
     if not osp.exists(slim_model_path):
         shutil.copyfile(local_slim_model_path, slim_model_path)
@@ -348,20 +321,7 @@
         slim_model_path = model_path
     print(f'load checkpoint from {slim_model_path}')

-     if args.model_type == 'infinity_2b':
-         kwargs_model = dict(depth=32, embed_dim=2048, num_heads=2048//128, drop_path_rate=0.1, mlp_ratio=4, block_chunks=8) # 2b model
-     elif args.model_type == 'infinity_layer12':
-         kwargs_model = dict(depth=12, embed_dim=768, num_heads=8, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
-     elif args.model_type == 'infinity_layer16':
-         kwargs_model = dict(depth=16, embed_dim=1152, num_heads=12, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
-     elif args.model_type == 'infinity_layer24':
-         kwargs_model = dict(depth=24, embed_dim=1536, num_heads=16, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
-     elif args.model_type == 'infinity_layer32':
-         kwargs_model = dict(depth=32, embed_dim=2080, num_heads=20, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
-     elif args.model_type == 'infinity_layer40':
-         kwargs_model = dict(depth=40, embed_dim=2688, num_heads=24, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
-     elif args.model_type == 'infinity_layer48':
-         kwargs_model = dict(depth=48, embed_dim=3360, num_heads=28, drop_path_rate=0.1, mlp_ratio=4, block_chunks=4)
+     model_config = ModelConfig.from_type(args.model_type)
     infinity = load_infinity(
         rope2d_each_sa_layer=args.rope2d_each_sa_layer,
         rope2d_normalized_by_hw=args.rope2d_normalized_by_hw,
@@ -372,8 +332,7 @@
         model_path=slim_model_path,
         scale_schedule=None,
         vae=vae,
-         device=None,
-         model_kwargs=kwargs_model,
+         model_kwargs=model_config.to_dict(),
         text_channels=args.text_channels,
         apply_spatial_patchify=args.apply_spatial_patchify,
         use_flex_attn=args.use_flex_attn,
@@ -440,10 +399,6 @@ weights_path = Path(__file__).parent / 'weights'
 weights_path.mkdir(exist_ok=True)
 download_infinity_weights(weights_path)

- # Device setup
- dtype = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32
- print(f"Using dtype: {dtype}")
-
 # Define args
 args = argparse.Namespace(
     pn='1M',
@@ -465,7 +420,7 @@ args = argparse.Namespace(
     cache_dir='/dev/shm',
     checkpoint_type='torch',
     seed=0,
-     bf16=1 if dtype == torch.bfloat16 else 0,
+     bf16=1 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 0,
     save_file='tmp.jpg',
     enable_model_cache=False,
 )
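argparse.Namespace is used here purely as an attribute container, with no CLI parsing involved; minimal sketch (illustrative):

    import argparse
    cfg = argparse.Namespace(pn='1M', seed=0)
    print(cfg.pn, cfg.seed)  # plain attribute access: 1M 0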
@@ -478,44 +433,61 @@ infinity = load_transformer(vae, args)

 # Define the image generation function
 @spaces.GPU
 def generate_image(prompt, cfg, tau, h_div_w, seed, enable_positive_prompt=False):
+     """Generate an image from a prompt with integrated generation logic."""
     try:
-         args.prompt = prompt
-         args.cfg = cfg
-         args.tau = tau
-         args.h_div_w = h_div_w
-         args.seed = seed
-         args.enable_positive_prompt = enable_positive_prompt
+         # Set random seed for reproducibility
+         if seed is not None:
+             torch.manual_seed(seed)
+             random.seed(seed)
+             np.random.seed(seed)

         # Find the closest h_div_w_template
         h_div_w_template_ = h_div_w_templates[np.argmin(np.abs(h_div_w_templates - h_div_w))]

         # Get scale_schedule based on h_div_w_template_
         scale_schedule = dynamic_resolution_h_w[h_div_w_template_][args.pn]['scales']
         scale_schedule = [(1, h, w) for (_, h, w) in scale_schedule]

-         # Generate the image
-         generated_image = gen_one_img(
-             infinity,
-             vae,
-             text_tokenizer,
-             text_encoder,
-             prompt,
-             g_seed=seed,
-             gt_leak=0,
-             gt_ls_Bl=None,
-             cfg_list=cfg,
-             tau_list=tau,
-             scale_schedule=scale_schedule,
-             cfg_insertion_layer=[args.cfg_insertion_layer],
-             vae_type=args.vae_type,
-             sampling_per_bits=args.sampling_per_bits,
-             enable_positive_prompt=enable_positive_prompt,
-         )
+         # Process the text prompt; no negative prompt in this demo
+         text_cond_tuple = encode_prompt(text_tokenizer, text_encoder, prompt, enable_positive_prompt)
+         negative_label_B_or_BLT = None
+
+         print(f'cfg: {cfg}, tau: {tau}')
+
+         # Generate the image with automatic mixed precision (logic inlined from gen_one_img)
+         with torch.amp.autocast(device_type=DEVICE.type, dtype=torch.bfloat16):
+             stt = time.time()
+             _, _, img_list = infinity.autoregressive_infer_cfg(
+                 vae=vae,
+                 scale_schedule=scale_schedule,
+                 label_B_or_BLT=text_cond_tuple,
+                 negative_label_B_or_BLT=negative_label_B_or_BLT,
+                 B=1,
+                 cfg_list=[cfg] * len(scale_schedule),
+                 tau_list=[tau] * len(scale_schedule),
+                 top_k=900,
+                 top_p=0.97,
+                 cfg_sc=3,
+                 cfg_exp_k=0.0,
+                 cfg_insertion_layer=[args.cfg_insertion_layer],
+                 vae_type=args.vae_type,
+                 gumbel=0,
+                 softmax_merge_topk=-1,
+                 returns_vemb=1,
+                 ret_img=True,
+                 trunk_scale=1000,
+                 gt_leak=0,
+                 gt_ls_Bl=None,
+                 g_seed=seed,
+                 sampling_per_bits=args.sampling_per_bits,
+                 inference_mode=True,
+             )
+         print(f'inference time: {time.time()-stt:.3f}s')

         # Convert the image to RGB and uint8
-         image = generated_image.cpu().numpy()
-         image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
-         image = np.uint8(image)
+         with torch.no_grad():
+             image = img_list[0].cpu().numpy()
+             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+             image = np.uint8(image)

         return image
     except Exception as e:
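The nearest-template lookup inside generate_image is a plain argmin over absolute differences; a numeric sketch with hypothetical template values (the real h_div_w_templates comes from utils.dynamic_resolution):

    import numpy as np
    templates = np.array([0.5, 0.75, 1.0, 1.333, 2.0])  # hypothetical ratios
    closest = templates[np.argmin(np.abs(templates - 0.8))]
    print(closest)  # 0.75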
@@ -525,6 +497,11 @@ def generate_image(prompt, cfg, tau, h_div_w, seed, enable_positive_prompt=False):

 # Set up Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown("<h1><center>Infinity Image Generator</center></h1>")
+     gr.Markdown("### Instructions")
+     gr.Markdown("1. Enter a prompt in the **Prompt Settings** section.")
+     gr.Markdown("2. Click the **Enhance Prompt** button to generate a more creative and detailed prompt.")
+     gr.Markdown("3. Adjust the **Image Settings** as desired.")
+     gr.Markdown("4. Click the **Generate Image** button to generate the image on the right.")

     with gr.Row():
         with gr.Column():
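The Blocks layout is truncated by the diff here; a minimal sketch of how such a UI typically wires a click handler to the generator (component names and the stub are hypothetical, not from this commit):

    import gradio as gr
    import numpy as np

    def fake_generate(prompt):  # stand-in for generate_image
        return np.zeros((64, 64, 3), dtype=np.uint8)

    with gr.Blocks() as sketch_demo:
        prompt_box = gr.Textbox(label='Prompt')          # hypothetical names
        output_img = gr.Image(label='Generated Image')
        generate_btn = gr.Button('Generate Image')
        generate_btn.click(fn=fake_generate, inputs=prompt_box, outputs=output_img)
    # sketch_demo.launch()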
 