Update handler.py

handler.py (CHANGED, +19 -2)
@@ -70,7 +70,7 @@ apply_dirty_hack_to_patch_file_extensions_and_bypass_filter("/repository")
 #print_directory_structure("/repository")
 
 
-def process_input_image(image_data: str, target_width: int, target_height: int) -> Image.Image:
+def process_input_image(image_data: str, target_width: int, target_height: int, input_image_quality: int) -> Image.Image:
     """
     Process input image from base64, resize and crop to target dimensions
 
@@ -78,6 +78,7 @@ def process_input_image(image_data: str, target_width: int, target_height: int)
         image_data: Base64 encoded image data
         target_width: Desired width
         target_height: Desired height
+        input_image_quality: JPEG quality (1-100); values below 100 recompress the image
 
     Returns:
         Processed PIL Image
@@ -127,6 +128,15 @@ def process_input_image(image_data: str, target_width: int, target_height: int)
     bottom = top + target_height
 
     image = image.crop((left, top, right, bottom))
+
+    # Apply JPEG compression if input_image_quality is not 100
+    if input_image_quality < 100:
+        # Save with compression to bytes buffer
+        buffer = io.BytesIO()
+        image.save(buffer, format='JPEG', quality=input_image_quality)
+        buffer.seek(0)
+        # Load compressed image back
+        image = Image.open(buffer)
 
     return image
 
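The block added above degrades the cropped image by round-tripping it through an in-memory JPEG encode. For readers who want to try the trick in isolation, here is a minimal standalone sketch (the helper name and the quality value are illustrative, not part of this commit):

    import io
    from PIL import Image

    def degrade_image(image: Image.Image, quality: int = 70) -> Image.Image:
        # Re-encode as JPEG in memory; quality < 100 introduces the kind of
        # compression artifacts a real video frame would carry.
        # JPEG has no alpha channel, so convert to RGB first (an extra
        # safeguard; the handler assumes the image is already RGB).
        buffer = io.BytesIO()
        image.convert("RGB").save(buffer, format="JPEG", quality=quality)
        buffer.seek(0)
        return Image.open(buffer)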
@@ -146,6 +156,10 @@ class GenerationConfig:
     width: int = 768
     height: int = 416
 
+    # this is a trick we use to convert a "pristine" image into a "dirty" video frame
+    # this helps fool LTX-Video into turning the image into an animated one
+    input_image_quality: int = 100
+
     # users may tend to always set this to the max, to get as much usable content as possible (which is MAX_FRAMES, i.e. 257).
     # The value must be a multiple of 8, plus 1 frame.
     # visual glitches appear after about 169 frames, so we don't need more actually
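The "multiple of 8, plus 1" rule mentioned in these comments is easy to enforce before generation; a hypothetical clamp (not part of this commit) could look like:

    def clamp_num_frames(requested: int, max_frames: int = 257) -> int:
        # Cap at MAX_FRAMES, then round down to the nearest (multiple of 8) + 1
        frames = min(requested, max_frames)
        return ((frames - 1) // 8) * 8 + 1

    clamp_num_frames(300)  # -> 257
    clamp_num_frames(130)  # -> 129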
@@ -319,6 +333,7 @@ class EndpointHandler:
         - negative_prompt (optional, string): list of concepts to ignore in the video.
         - width (optional, int, defaults to 768): width, or horizontal size in pixels.
         - height (optional, int, defaults to 512): height, or vertical size in pixels.
+        - input_image_quality (optional, int, defaults to 100): a trick we use to convert a "pristine" image into a "dirty" video frame. This helps fool LTX-Video into turning the image into an animated one.
         - num_frames (optional, int, defaults to 129): the number of frames must be a multiple of 8, plus 1 frame.
         - guidance_scale (optional, float, defaults to 3.5): guidance scale (values between 3.0 and 4.0 work well)
         - num_inference_steps (optional, int, defaults to 50): number of inference steps
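Assuming the usual Hugging Face Inference Endpoints envelope (the "inputs"/"parameters" split below is an assumption, not shown in this diff), a request exercising the new parameter might look like:

    payload = {
        "inputs": "a cat walking through tall grass",  # illustrative prompt
        "parameters": {
            "width": 768,
            "height": 416,
            "input_image_quality": 70,  # below 100 triggers the JPEG round trip
            "num_frames": 129,          # a multiple of 8, plus 1
            "guidance_scale": 3.5,
            "num_inference_steps": 50,
        },
    }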
@@ -362,6 +377,7 @@ class EndpointHandler:
             # video model settings (will be used during generation of the initial raw video clip)
             width=params.get("width", GenerationConfig.width),
             height=params.get("height", GenerationConfig.height),
+            input_image_quality=params.get("input_image_quality", GenerationConfig.input_image_quality),
             num_frames=params.get("num_frames", GenerationConfig.num_frames),
             guidance_scale=params.get("guidance_scale", GenerationConfig.guidance_scale),
             num_inference_steps=params.get("num_inference_steps", GenerationConfig.num_inference_steps),
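Each params.get(key, GenerationConfig.attr) call reads a request parameter and falls back to the class-level default, so the config class doubles as the defaults table. A condensed illustration (the dataclass decorator is assumed from the attribute syntax shown above):

    from dataclasses import dataclass

    @dataclass
    class GenerationConfig:
        width: int = 768
        height: int = 416
        input_image_quality: int = 100

    params = {"width": 1024, "input_image_quality": 70}  # parsed from a request
    config = GenerationConfig(
        width=params.get("width", GenerationConfig.width),
        height=params.get("height", GenerationConfig.height),
        input_image_quality=params.get("input_image_quality", GenerationConfig.input_image_quality),
    )
    # -> GenerationConfig(width=1024, height=416, input_image_quality=70)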
@@ -422,7 +438,8 @@ class EndpointHandler:
             processed_image = process_input_image(
                 input_image,
                 config.width,
-                config.height
+                config.height,
+                config.input_image_quality,
             )
             generation_kwargs["image"] = processed_image
             frames = self.image_to_video(**generation_kwargs).frames
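Putting the pieces together, a hypothetical local check of the updated helper (assuming, per its docstring, that it accepts a bare base64 string; the test image is fabricated on the fly and not part of this commit):

    import base64
    import io
    from PIL import Image

    # Build a small in-memory PNG and base64-encode it, the way a client would
    img = Image.new("RGB", (1024, 576), color=(120, 180, 90))
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    image_data = base64.b64encode(buf.getvalue()).decode()

    processed = process_input_image(image_data, 768, 416, input_image_quality=70)
    assert processed.size == (768, 416)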