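"""Gradio Space: generate an animated GIF from a text prompt and reference style images.

GPT-4o-mini plans a handful of sparse keyframes, SDXL-Turbo (with an IP-Adapter) renders
them in the style of the uploaded references, and AnimateDiff with SparseControlNet
animates between them.
"""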
import gradio as gr
from huggingface_hub import login
import os
import json
import spaces
import torch
import openai
from diffusers import AnimateDiffSparseControlNetPipeline, AutoPipelineForText2Image
from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel
from diffusers.schedulers import DPMSolverMultistepScheduler
from diffusers.utils import export_to_gif, load_image
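
# Authenticate with the Hugging Face Hub and OpenAI using tokens stored as
# environment variables (Space secrets).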
token = os.getenv("HF_TOKEN")
login(token=token)
openai_token = os.getenv("OPENAI_TOKEN")
openai.api_key = openai_token
openaiclient = openai.OpenAI(api_key=openai.api_key)
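

# Helper around the OpenAI chat completions API; with return_str=False the
# JSON-mode reply is parsed into a dict.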
def ask_gpt(message_history, model="gpt-4o-mini", return_str=True, response_format={"type": "json_object"}):
    response = openaiclient.chat.completions.create(
        model=model,
        messages=message_history,
        response_format=response_format,
        max_tokens=4000,
    )
    if return_str:
        return response.choices[0].message.content
    return json.loads(response.choices[0].message.content)
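

# SDXL-Turbo text-to-image pipeline with an IP-Adapter, used to render each
# GPT-planned keyframe in the style of the uploaded reference images.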
image_pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/sdxl-turbo", torch_dtype=torch.float16
).to("cuda")
image_pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")


@spaces.GPU
def generate_image(prompt, reference_image, controlnet_conditioning_scale):
    """Render one keyframe, steering its style with the IP-Adapter reference images."""
    # gr.File(type="filepath") yields plain path strings, which load_image accepts directly.
    style_images = [load_image(f) for f in reference_image]
    image_pipeline.set_ip_adapter_scale(controlnet_conditioning_scale)
    image = image_pipeline(
        prompt=prompt,
        ip_adapter_image=[style_images],
        negative_prompt="",
        guidance_scale=5,
        num_inference_steps=30,
    ).images[0]
    return image
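

# AnimateDiff + SparseControlNet stack: Realistic Vision 5.1 as the base model,
# a motion adapter plus motion LoRA for temporal dynamics, an RGB SparseControlNet
# to pin the generated keyframes at their frame indices, and the ft-MSE VAE for decoding.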
model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
controlnet_id = "guoyww/animatediff-sparsectrl-rgb"
lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3"
vae_id = "stabilityai/sd-vae-ft-mse"
device = "cuda"
motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
scheduler = DPMSolverMultistepScheduler.from_pretrained(
    model_id,
    subfolder="scheduler",
    beta_schedule="linear",
    algorithm_type="dpmsolver++",
    use_karras_sigmas=True,
)
gif_pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
    model_id,
    motion_adapter=motion_adapter,
    controlnet=controlnet,
    vae=vae,
    scheduler=scheduler,
    torch_dtype=torch.float16,
).to(device)
gif_pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
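# FreeNoise lets the 16-frame motion module produce longer clips by denoising
# overlapping 16-frame context windows (stride 4) and blending the results.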
gif_pipe.enable_free_noise(context_length=16, context_stride=4)


# Both generation functions share this scene-designer system prompt.
FRAME_DESIGNER_SYSTEM_PROMPT = """
You are a scene designer tasked with creating sparse frames of a video. You will be given a prompt describing the desired video, and your goal is to design only the key frames (sparse frames) that represent major changes in the scene. Do not include repetitive or similar scenes—only capture distinct moments.

Expected Format:
Return the response as a JSON object with the key "frames". The value should be a list of dictionaries, where each dictionary has:
"frame_index": an integer indicating the frame's position in the sequence.
"description": a brief description of the scene in this frame.

Example:
If given a prompt like "A sunset over a beach with waves crashing and a ship sailing by," your response should look like this:
```json
{
  "frames": [
    {"frame_index": 0, "description": "Sunset over an empty beach, sky turning orange and pink"},
    {"frame_index": 30, "description": "Waves gently crashing on the shore"},
    {"frame_index": 60, "description": "A ship appears on the horizon, silhouetted by the sunset"},
    {"frame_index": 90, "description": "Ship sailing closer, with waves becoming more dynamic"},
    {"frame_index": 120, "description": "Sun dipping below the horizon, casting a golden glow over the water"}
  ]
}
```
This way, each frame represents a distinct scene, and there’s no redundancy between them."""


@spaces.GPU
def generate_frames(prompt, reference_image, controlnet_conditioning_scale, num_frames):
    """Preview helper: plan sparse keyframes with GPT and return only the rendered conditioning images."""
    message_history = [
        {"role": "system", "content": FRAME_DESIGNER_SYSTEM_PROMPT},
        {"role": "user", "content": f"give me the frames to generate a video with prompt : `{prompt}`"},
    ]
    frames = ask_gpt(message_history, return_str=False)["frames"]
    conditioning_frames = []
    for frame in frames:
        conditioning_frames.append(generate_image(frame["description"], reference_image, controlnet_conditioning_scale))
    return conditioning_frames


@spaces.GPU
def generate_gif(prompt, reference_image, controlnet_conditioning_scale, num_frames):
    """Full pipeline: plan sparse keyframes with GPT, render them with SDXL-Turbo,
    then animate between them with AnimateDiff + SparseControlNet and export a GIF.
    """
    # NOTE: num_frames comes from the UI slider but is not currently forwarded to the pipeline.
    message_history = [
        {"role": "system", "content": FRAME_DESIGNER_SYSTEM_PROMPT},
        {"role": "user", "content": f"give me the frames to generate a video with prompt : `{prompt}`"},
    ]
    frames = ask_gpt(message_history, return_str=False)["frames"]
    conditioning_frames = []
    controlnet_frame_indices = []
    # Map frame_index -> description so the FreeNoise pipeline can interpolate prompts across the clip.
    long_prompt = {}
    for frame in frames:
        conditioning_frames.append(generate_image(frame["description"], reference_image, float(controlnet_conditioning_scale)))
        controlnet_frame_indices.append(frame["frame_index"])
        long_prompt[frame["frame_index"]] = frame["description"]
    video = gif_pipe(
        prompt=long_prompt,
        negative_prompt="low quality, worst quality",
        num_inference_steps=25,
        conditioning_frames=conditioning_frames,
        controlnet_frame_indices=controlnet_frame_indices,
        controlnet_conditioning_scale=float(controlnet_conditioning_scale),
    ).frames[0]
    export_to_gif(video, "output.gif")
    return "output.gif"


# Set up the Gradio interface.
interface = gr.Interface(
    fn=generate_gif,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.File(type="filepath", file_count="multiple", label="Reference Images (Style)"),
        gr.Slider(label="ControlNet Conditioning Scale", minimum=0, maximum=1.0, step=0.1, value=1.0),
        gr.Slider(label="Number of Frames", minimum=0, maximum=100, step=1, value=10),
    ],
    outputs="image",
    title="GIF Generation with AnimateDiff SparseControlNet",
    description="Generates an animated GIF from a text prompt and reference style images: GPT-4o-mini plans sparse keyframes, SDXL-Turbo with IP-Adapter renders them, and AnimateDiff with SparseControlNet animates between them.",
)
# Alternative interface for previewing only the planned keyframes (no animation):
# interface = gr.Interface(
#     fn=generate_frames,
#     inputs=[
#         gr.Textbox(label="Prompt"),
#         gr.File(type="filepath", file_count="multiple", label="Reference Images (Style)"),
#         gr.Slider(label="ControlNet Conditioning Scale", minimum=0, maximum=1.0, step=0.1, value=1.0),
#         gr.Slider(label="Number of Frames", minimum=0, maximum=100, step=1, value=10),
#     ],
#     outputs="gallery",
#     title="Keyframe Preview",
#     description="Generates the sparse conditioning keyframes for a prompt using the reference style images.",
# )
interface.launch()