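"""Gradio Space: generate an animated GIF from a text prompt and reference style images.

GPT-4o-mini plans a handful of sparse keyframes, SDXL-Turbo (with an IP-Adapter) renders
them in the style of the uploaded references, and AnimateDiff with SparseControlNet
animates between them.
"""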
import gradio as gr
from huggingface_hub import login
import os
import json
import spaces
import torch
import openai
from diffusers import AnimateDiffSparseControlNetPipeline, AutoPipelineForText2Image
from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel
from diffusers.schedulers import DPMSolverMultistepScheduler
from diffusers.utils import export_to_gif, load_image
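
# Authenticate with the Hugging Face Hub and OpenAI using tokens stored as
# environment variables (Space secrets).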
token = os.getenv("HF_TOKEN")
login(token=token)
openai_token = os.getenv("OPENAI_TOKEN")
openai.api_key = openai_token
openaiclient = openai.OpenAI(api_key=openai.api_key)
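

# Helper around the OpenAI chat completions API; with return_str=False the
# JSON-mode reply is parsed into a dict.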
def ask_gpt(message_history, model="gpt-4o-mini", return_str=True, response_format={"type": "json_object"}):
    response = openaiclient.chat.completions.create(
        model=model,
        messages=message_history,
        response_format=response_format,
        max_tokens=4000,
    )
    if return_str:
        return response.choices[0].message.content
    return json.loads(response.choices[0].message.content)
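

# SDXL-Turbo text-to-image pipeline with an IP-Adapter, used to render each
# GPT-planned keyframe in the style of the uploaded reference images.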
image_pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sdxl-turbo", torch_dtype=torch.float16
).to("cuda")
image_pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")


@spaces.GPU
def generate_image(prompt, reference_image, controlnet_conditioning_scale):
    """Render one keyframe, steering its style with the IP-Adapter reference images."""
    # gr.File(type="filepath") yields plain path strings, which load_image accepts directly.
    style_images = [load_image(f) for f in reference_image]
    image_pipeline.set_ip_adapter_scale(controlnet_conditioning_scale)
    image = image_pipeline(
        prompt=prompt,
        ip_adapter_image=[style_images],
        negative_prompt="",
        guidance_scale=5,
        num_inference_steps=30,
    ).images[0]
    return image
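

# AnimateDiff + SparseControlNet stack: Realistic Vision 5.1 as the base model,
# a motion adapter plus motion LoRA for temporal dynamics, an RGB SparseControlNet
# to pin the generated keyframes at their frame indices, and the ft-MSE VAE for decoding.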
model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
controlnet_id = "guoyww/animatediff-sparsectrl-rgb"
lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3"
vae_id = "stabilityai/sd-vae-ft-mse"
device = "cuda"
motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
scheduler = DPMSolverMultistepScheduler.from_pretrained(
    model_id,
    subfolder="scheduler",
    beta_schedule="linear",
    algorithm_type="dpmsolver++",
    use_karras_sigmas=True,
)
gif_pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
    model_id,
    motion_adapter=motion_adapter,
    controlnet=controlnet,
    vae=vae,
    scheduler=scheduler,
    torch_dtype=torch.float16,
).to(device)
gif_pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
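# FreeNoise lets the 16-frame motion module produce longer clips by denoising
# overlapping 16-frame context windows (stride 4) and blending the results.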
gif_pipe.enable_free_noise(context_length=16, context_stride=4)


# Both generation functions share this scene-designer system prompt.
FRAME_DESIGNER_SYSTEM_PROMPT = """
You are a scene designer tasked with creating sparse frames of a video. You will be given a prompt describing the desired video, and your goal is to design only the key frames (sparse frames) that represent major changes in the scene. Do not include repetitive or similar scenes—only capture distinct moments.

Expected Format:
Return the response as a JSON object with the key "frames". The value should be a list of dictionaries, where each dictionary has:
"frame_index": an integer indicating the frame's position in the sequence.
"description": a brief description of the scene in this frame.

Example:
If given a prompt like "A sunset over a beach with waves crashing and a ship sailing by," your response should look like this:
```json
{
  "frames": [
    {"frame_index": 0, "description": "Sunset over an empty beach, sky turning orange and pink"},
    {"frame_index": 30, "description": "Waves gently crashing on the shore"},
    {"frame_index": 60, "description": "A ship appears on the horizon, silhouetted by the sunset"},
    {"frame_index": 90, "description": "Ship sailing closer, with waves becoming more dynamic"},
    {"frame_index": 120, "description": "Sun dipping below the horizon, casting a golden glow over the water"}
  ]
}
```
This way, each frame represents a distinct scene, and there’s no redundancy between them."""


@spaces.GPU
def generate_frames(prompt, reference_image, controlnet_conditioning_scale, num_frames):
    """Preview helper: plan sparse keyframes with GPT and return only the rendered conditioning images."""
    message_history = [
        {"role": "system", "content": FRAME_DESIGNER_SYSTEM_PROMPT},
        {"role": "user", "content": f"give me the frames to generate a video with prompt : `{prompt}`"},
    ]
    frames = ask_gpt(message_history, return_str=False)["frames"]
    conditioning_frames = []
    for frame in frames:
        conditioning_frames.append(generate_image(frame["description"], reference_image, controlnet_conditioning_scale))
    return conditioning_frames


@spaces.GPU
def generate_gif(prompt, reference_image, controlnet_conditioning_scale, num_frames):
    """Full pipeline: plan sparse keyframes with GPT, render them with SDXL-Turbo,
    then animate between them with AnimateDiff + SparseControlNet and export a GIF.
    """
    # NOTE: num_frames comes from the UI slider but is not currently forwarded to the pipeline.
    message_history = [
        {"role": "system", "content": FRAME_DESIGNER_SYSTEM_PROMPT},
        {"role": "user", "content": f"give me the frames to generate a video with prompt : `{prompt}`"},
    ]
    frames = ask_gpt(message_history, return_str=False)["frames"]
    conditioning_frames = []
    controlnet_frame_indices = []
    # Map frame_index -> description so the FreeNoise pipeline can interpolate prompts across the clip.
    long_prompt = {}
    for frame in frames:
        conditioning_frames.append(generate_image(frame["description"], reference_image, float(controlnet_conditioning_scale)))
        controlnet_frame_indices.append(frame["frame_index"])
        long_prompt[frame["frame_index"]] = frame["description"]
    video = gif_pipe(
        prompt=long_prompt,
        negative_prompt="low quality, worst quality",
        num_inference_steps=25,
        conditioning_frames=conditioning_frames,
        controlnet_frame_indices=controlnet_frame_indices,
        controlnet_conditioning_scale=float(controlnet_conditioning_scale),
    ).frames[0]
    export_to_gif(video, "output.gif")
    return "output.gif"


# Set up the Gradio interface.
interface = gr.Interface(
    fn=generate_gif,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.File(type="filepath", file_count="multiple", label="Reference Images (Style)"),
        gr.Slider(label="ControlNet Conditioning Scale", minimum=0, maximum=1.0, step=0.1, value=1.0),
        gr.Slider(label="Number of Frames", minimum=0, maximum=100, step=1, value=10),
    ],
    outputs="image",
    title="GIF Generation with AnimateDiff SparseControlNet",
    description="Generates an animated GIF from a text prompt and reference style images: GPT-4o-mini plans sparse keyframes, SDXL-Turbo with IP-Adapter renders them, and AnimateDiff with SparseControlNet animates between them.",
)
# Alternative interface for previewing only the planned keyframes (no animation):
# interface = gr.Interface(
#     fn=generate_frames,
#     inputs=[
#         gr.Textbox(label="Prompt"),
#         gr.File(type="filepath", file_count="multiple", label="Reference Images (Style)"),
#         gr.Slider(label="ControlNet Conditioning Scale", minimum=0, maximum=1.0, step=0.1, value=1.0),
#         gr.Slider(label="Number of Frames", minimum=0, maximum=100, step=1, value=10),
#     ],
#     outputs="gallery",
#     title="Keyframe Preview",
#     description="Generates the sparse conditioning keyframes for a prompt using the reference style images.",
# )
interface.launch()