change seed and analytics
app.py CHANGED
@@ -1,7 +1,9 @@
+from functools import lru_cache
 import gradio as gr
 from gradio_toggle import Toggle
 import torch
 from huggingface_hub import snapshot_download
+from transformers import CLIPProcessor, CLIPModel
 
 from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
 from xora.models.transformers.transformer3d import Transformer3DModel
@@ -20,6 +22,9 @@ import tempfile
 import os
 import gc
 from openai import OpenAI
+import csv
+from datetime import datetime
+
 
 # Load Hugging Face token if needed
 hf_token = os.getenv("HF_TOKEN")
@@ -36,9 +41,7 @@ with open(system_prompt_i2v_path, "r") as f:
 # Set model download directory within Hugging Face Spaces
 model_path = "asset"
 if not os.path.exists(model_path):
-    snapshot_download(
-        "Lightricks/LTX-Video", local_dir=model_path, repo_type="model", token=hf_token
-    )
+    snapshot_download("Lightricks/LTX-Video", local_dir=model_path, repo_type="model", token=hf_token)
 
 # Global variables to load components
 vae_dir = Path(model_path) / "vae"
@@ -47,6 +50,94 @@ scheduler_dir = Path(model_path) / "scheduler"
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+DATA_DIR = "/data"
+os.makedirs(DATA_DIR, exist_ok=True)
+LOG_FILE_PATH = os.path.join("/data", "user_requests.csv")
+
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", cache_dir=model_path)
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", cache_dir=model_path)
+
+
+if not os.path.exists(LOG_FILE_PATH):
+    with open(LOG_FILE_PATH, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(
+            [
+                "timestamp",
+                "request_type",
+                "prompt",
+                "negative_prompt",
+                "height",
+                "width",
+                "num_frames",
+                "frame_rate",
+                "seed",
+                "num_inference_steps",
+                "guidance_scale",
+                "is_enhanced",
+                "clip_embedding",
+                "original_resolution",
+            ]
+        )
+
+
+@lru_cache(maxsize=128)
+def log_request(
+    request_type,
+    prompt,
+    negative_prompt,
+    height,
+    width,
+    num_frames,
+    frame_rate,
+    seed,
+    num_inference_steps,
+    guidance_scale,
+    is_enhanced,
+    clip_embedding=None,
+    original_resolution=None,
+):
+    """Log the user's request to a CSV file."""
+    timestamp = datetime.now().isoformat()
+    with open(LOG_FILE_PATH, "a", newline="") as f:
+        try:
+            writer = csv.writer(f)
+            writer.writerow(
+                [
+                    timestamp,
+                    request_type,
+                    prompt,
+                    negative_prompt,
+                    height,
+                    width,
+                    num_frames,
+                    frame_rate,
+                    seed,
+                    num_inference_steps,
+                    guidance_scale,
+                    is_enhanced,
+                    clip_embedding,
+                    original_resolution,
+                ]
+            )
+        except Exception as e:
+            print(f"Error logging request: {e}")
+
+
+def compute_clip_embedding(text=None, image=None):
+    """
+    Compute CLIP embedding for a given text or image.
+    Args:
+        text (str): Input text prompt.
+        image (PIL.Image): Input image.
+    Returns:
+        list: CLIP embedding as a list of floats.
+    """
+    inputs = clip_processor(text=text, images=image, return_tensors="pt", padding=True)
+    outputs = clip_model.get_text_features(**inputs) if text else clip_model.get_image_features(**inputs)
+    embedding = outputs.detach().cpu().numpy().flatten().tolist()
+    return embedding
+
 
 def load_vae(vae_dir):
     vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
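For orientation, here is a small standalone sketch (not part of the commit) of how the `compute_clip_embedding` helper defined above behaves when called with text only or with an image only; the example prompt and the blank test image are made up:

```python
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def compute_clip_embedding(text=None, image=None):
    # Same branching as in app.py: text goes through get_text_features,
    # an image goes through get_image_features.
    inputs = clip_processor(text=text, images=image, return_tensors="pt", padding=True)
    outputs = clip_model.get_text_features(**inputs) if text else clip_model.get_image_features(**inputs)
    return outputs.detach().cpu().numpy().flatten().tolist()

text_vec = compute_clip_embedding(text="a cat surfing at sunset")   # hypothetical prompt
image_vec = compute_clip_embedding(image=Image.new("RGB", (224, 224)))  # dummy image
print(len(text_vec), len(image_vec))  # both 512 for clip-vit-base-patch32
```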
@@ -185,12 +276,8 @@ vae = load_vae(vae_dir)
 unet = load_unet(unet_dir)
 scheduler = load_scheduler(scheduler_dir)
 patchifier = SymmetricPatchifier(patch_size=1)
-text_encoder = T5EncoderModel.from_pretrained(
-    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
-).to(device)
-tokenizer = T5Tokenizer.from_pretrained(
-    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
-)
+text_encoder = T5EncoderModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder").to(device)
+tokenizer = T5Tokenizer.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer")
 
 pipeline = XoraVideoPipeline(
     transformer=unet,
@@ -205,9 +292,10 @@ pipeline = XoraVideoPipeline(
 def generate_video_from_text(
     prompt="",
     enhance_prompt_toggle=False,
+    txt2vid_analytics_toggle=True,
     negative_prompt="",
     frame_rate=25,
-    seed=
+    seed=646373,
     num_inference_steps=30,
     guidance_scale=3,
     height=512,
@@ -221,6 +309,21 @@ def generate_video_from_text(
         duration=5,
     )
 
+    if txt2vid_analytics_toggle:
+        log_request(
+            "txt2vid",
+            prompt,
+            negative_prompt,
+            height,
+            width,
+            num_frames,
+            frame_rate,
+            seed,
+            num_inference_steps,
+            guidance_scale,
+            enhance_prompt_toggle,
+        )
+
     prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="t2v")
 
     sample = {
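One behavioural note on the logging path used above (an observation, not a change): because `log_request` is decorated with `@lru_cache(maxsize=128)`, a repeat call with exactly the same arguments is answered from the cache, so the CSV row for an identical request is written only once. A self-contained sketch of that effect, using a hypothetical `demo_log` stand-in:

```python
from functools import lru_cache

calls = []  # records how many times the function body actually runs

@lru_cache(maxsize=128)
def demo_log(request_type, prompt, seed):
    # The side effect only happens on a cache miss.
    calls.append((request_type, prompt, seed))

demo_log("txt2vid", "a cat surfing", 646373)
demo_log("txt2vid", "a cat surfing", 646373)  # cache hit: body not executed again
demo_log("img2vid", "a cat surfing", 646373)  # different arguments: logged

print(len(calls))  # 2, not 3
```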
@@ -269,9 +372,7 @@ def generate_video_from_text(
     video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
     video_np = (video_np * 255).astype(np.uint8)
     height, width = video_np.shape[1:3]
-    out = cv2.VideoWriter(
-        output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
-    )
+    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height))
     for frame in video_np[..., ::-1]:
         out.write(frame)
     out.release()
@@ -286,9 +387,10 @@ def generate_video_from_image(
     image_path,
     prompt="",
     enhance_prompt_toggle=False,
+    img2vid_analytics_toggle=True,
     negative_prompt="",
     frame_rate=25,
-    seed=
+    seed=646373,
     num_inference_steps=30,
     guidance_scale=3,
     height=512,
@@ -310,9 +412,28 @@ def generate_video_from_image(
     if not image_path:
         raise gr.Error("Please provide an input image.", duration=5)
 
-    media_items = (
-        load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
-    )
+    if img2vid_analytics_toggle:
+        with Image.open(image_path) as img:
+            original_resolution = f"{img.width}x{img.height}"  # Format as "widthxheight"
+            clip_embedding = compute_clip_embedding(image=img)
+
+        log_request(
+            "img2vid",
+            prompt,
+            negative_prompt,
+            height,
+            width,
+            num_frames,
+            frame_rate,
+            seed,
+            num_inference_steps,
+            guidance_scale,
+            enhance_prompt_toggle,
+            json.dumps(clip_embedding),
+            original_resolution,
+        )
+
+    media_items = load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
 
     prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="i2v")
 
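For completeness, a hedged sketch of how the log written to `LOG_FILE_PATH` could later be analysed offline; it assumes pandas is installed and that at least one img2vid request has already been recorded:

```python
import json

import pandas as pd

# Read the analytics CSV written by log_request.
df = pd.read_csv("/data/user_requests.csv")

# Breakdown of logged requests by type.
print(df["request_type"].value_counts())

# img2vid rows store the CLIP image embedding as a JSON-encoded list of floats.
img_rows = df[df["request_type"] == "img2vid"].dropna(subset=["clip_embedding"])
embeddings = img_rows["clip_embedding"].map(json.loads)
print(embeddings.map(len).head())  # expected: 512 per row for clip-vit-base-patch32
```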
@@ -353,9 +474,7 @@ def generate_video_from_image(
     video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
     video_np = (video_np * 255).astype(np.uint8)
     height, width = video_np.shape[1:3]
-    out = cv2.VideoWriter(
-        output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
-    )
+    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height))
     for frame in video_np[..., ::-1]:
         out.write(frame)
     out.release()
@@ -374,15 +493,9 @@ def generate_video_from_image(
 
 def create_advanced_options():
     with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
-        seed = gr.Slider(
-
-        )
-        inference_steps = gr.Slider(
-            label="4.2 Inference Steps", minimum=1, maximum=50, step=1, value=30
-        )
-        guidance_scale = gr.Slider(
-            label="4.3 Guidance Scale", minimum=1.0, maximum=5.0, step=0.1, value=3.0
-        )
+        seed = gr.Slider(label="4.1 Seed", minimum=0, maximum=1000000, step=1, value=646373)
+        inference_steps = gr.Slider(label="4.2 Inference Steps", minimum=1, maximum=50, step=1, value=30)
+        guidance_scale = gr.Slider(label="4.3 Guidance Scale", minimum=1.0, maximum=5.0, step=0.1, value=3.0)
 
         height_slider = gr.Slider(
             label="4.4 Height",
@@ -451,9 +564,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
         </div>
         """
     )
-    with gr.Accordion(
-        " π Tips for Best Results", open=False, elem_id="instructions-accordion"
-    ):
+    with gr.Accordion(" π Tips for Best Results", open=False, elem_id="instructions-accordion"):
         gr.Markdown(
             """
             π Prompt Engineering
@@ -491,6 +602,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
                 value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
                 lines=5,
             )
+            txt2vid_analytics_toggle = Toggle(
+                label="I agree to share my usage data anonymously to help improve the model features.",
+                value=True,
+                interactive=True,
+            )
+
             txt2vid_enhance_toggle = Toggle(
                 label="Enhance Prompt",
                 value=False,
@@ -566,6 +683,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
                 value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
                 lines=5,
             )
+            img2vid_analytics_toggle = Toggle(
+                label="I agree to share my usage data anonymously to help improve the model features.",
+                value=True,
+                interactive=True,
+            )
             img2vid_enhance_toggle = Toggle(
                 label="Enhance Prompt",
                 value=False,
@@ -593,9 +715,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
             )
 
             img2vid_advanced = create_advanced_options()
-            img2vid_generate = gr.Button(
-                "Step 6: Generate Video", variant="primary", size="lg"
-            )
+            img2vid_generate = gr.Button("Step 6: Generate Video", variant="primary", size="lg")
 
         with gr.Column():
             img2vid_output = gr.Video(label="Generated Output")
@@ -632,15 +752,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
     )
 
     # [Previous event handlers remain the same]
-    txt2vid_preset.change(
-        fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:]
-    )
+    txt2vid_preset.change(fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:])
 
     txt2vid_generate.click(
         fn=generate_video_from_text,
         inputs=[
             txt2vid_prompt,
             txt2vid_enhance_toggle,
+            txt2vid_analytics_toggle,
             txt2vid_negative_prompt,
             txt2vid_frame_rate,
             *txt2vid_advanced,
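A side note on the wiring above (not part of the diff): Gradio maps the components in `inputs` to the callback's parameters by position, which is why `txt2vid_analytics_toggle` is inserted between the enhance toggle and the negative prompt both here and in the `generate_video_from_text` signature. A minimal sketch with hypothetical component names:

```python
import gradio as gr

def preview(prompt, enhance, analytics, negative_prompt):
    # Parameter order must match the order of components in `inputs` below.
    return f"enhance={enhance}, analytics={analytics}"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    enhance = gr.Checkbox(label="Enhance Prompt")
    analytics = gr.Checkbox(label="Share usage data")
    negative = gr.Textbox(label="Negative prompt")
    result = gr.Textbox(label="Result")
    run = gr.Button("Run")
    # Values are passed positionally, not matched by component name.
    run.click(fn=preview, inputs=[prompt, enhance, analytics, negative], outputs=result)
```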
@@ -651,9 +770,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
         queue=True,
     )
 
-    img2vid_preset.change(
-        fn=preset_changed, inputs=[img2vid_preset], outputs=img2vid_advanced[3:]
-    )
+    img2vid_preset.change(fn=preset_changed, inputs=[img2vid_preset], outputs=img2vid_advanced[3:])
 
     img2vid_generate.click(
         fn=generate_video_from_image,
@@ -661,6 +778,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
             img2vid_image,
             img2vid_prompt,
             img2vid_enhance_toggle,
+            img2vid_analytics_toggle,
             img2vid_negative_prompt,
             img2vid_frame_rate,
             *img2vid_advanced,
@@ -672,6 +790,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
     )
 
 if __name__ == "__main__":
-    iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(
-        share=True, show_api=False
-    )
+    iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(share=True, show_api=False)