Spaces:
Runtime error
Runtime error
nice demo
Browse files- README.md +0 -14
- app.py +17 -0
- app_bokehK.py +77 -0
- app_color_temperature.py +77 -0
- app_focal_length.py +77 -0
- app_shutter_speed.py +77 -0
- configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml +5 -8
- configs/inference_genphoto/adv3_256_384_genphoto_relora_color_temperature.yaml +4 -7
- configs/inference_genphoto/adv3_256_384_genphoto_relora_focal_length.yaml +5 -7
- configs/inference_genphoto/adv3_256_384_genphoto_relora_shutter_speed.yaml +4 -7
- inference_bokehK.py +13 -22
- inference_color_temperature.py +21 -96
- inference_focal_length.py +22 -95
- inference_shutter_speed.py +26 -100
- requirements.txt +5 -5
README.md
DELETED
|
@@ -1,14 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Generative Photography
|
| 3 |
-
emoji: 📈
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: blue
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 5.20.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
license: cc-by-nc-nd-4.0
|
| 11 |
-
short_description: Demo for Generative Photography
|
| 12 |
-
---
|
| 13 |
-
|
| 14 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,11 +1,28 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import json
|
| 3 |
import torch
|
|
|
|
|
|
|
| 4 |
from inference_bokehK import load_models as load_bokeh_models, run_inference as run_bokeh_inference, OmegaConf
|
| 5 |
from inference_focal_length import load_models as load_focal_models, run_inference as run_focal_inference
|
| 6 |
from inference_shutter_speed import load_models as load_shutter_models, run_inference as run_shutter_inference
|
| 7 |
from inference_color_temperature import load_models as load_color_models, run_inference as run_color_inference
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
torch.manual_seed(42)
|
| 10 |
|
| 11 |
bokeh_cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml")
|
|
|
|
| 1 |
+
import os
|
| 2 |
import gradio as gr
|
| 3 |
import json
|
| 4 |
import torch
|
| 5 |
+
from huggingface_hub import snapshot_download
|
| 6 |
+
|
| 7 |
from inference_bokehK import load_models as load_bokeh_models, run_inference as run_bokeh_inference, OmegaConf
|
| 8 |
from inference_focal_length import load_models as load_focal_models, run_inference as run_focal_inference
|
| 9 |
from inference_shutter_speed import load_models as load_shutter_models, run_inference as run_shutter_inference
|
| 10 |
from inference_color_temperature import load_models as load_color_models, run_inference as run_color_inference
|
| 11 |
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
model_path = "ckpts"
|
| 16 |
+
os.makedirs(model_path, exist_ok=True)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
print("Downloading models from Hugging Face...")
|
| 20 |
+
snapshot_download(repo_id="pandaphd/generative_photography", local_dir=model_path)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
torch.manual_seed(42)
|
| 27 |
|
| 28 |
bokeh_cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml")
|
app_bokehK.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import tempfile
|
| 3 |
+
import json
|
| 4 |
+
from inference_bokehK import load_models, run_inference, OmegaConf
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
# Initialize models once at startup
|
| 8 |
+
cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml")
|
| 9 |
+
pipeline, device = load_models(cfg)
|
| 10 |
+
|
| 11 |
+
def generate_video(base_scene, bokehK_list):
|
| 12 |
+
try:
|
| 13 |
+
# Validate input
|
| 14 |
+
if len(json.loads(bokehK_list)) != 5:
|
| 15 |
+
raise ValueError("Exactly 5 Bokeh K values required")
|
| 16 |
+
|
| 17 |
+
# Run inference
|
| 18 |
+
video_path = run_inference(
|
| 19 |
+
pipeline=pipeline,
|
| 20 |
+
tokenizer=pipeline.tokenizer,
|
| 21 |
+
text_encoder=pipeline.text_encoder,
|
| 22 |
+
base_scene=base_scene,
|
| 23 |
+
bokehK_list=bokehK_list,
|
| 24 |
+
device=device
|
| 25 |
+
)
|
| 26 |
+
return video_path
|
| 27 |
+
|
| 28 |
+
except Exception as e:
|
| 29 |
+
raise gr.Error(f"Generation failed: {str(e)}")
|
| 30 |
+
|
| 31 |
+
# Example inputs
|
| 32 |
+
examples = [
|
| 33 |
+
[
|
| 34 |
+
"A young boy wearing an orange jacket is standing on a crosswalk, waiting to cross the street.",
|
| 35 |
+
"[2.5, 6.3, 10.1, 17.2, 24.0]"
|
| 36 |
+
],
|
| 37 |
+
[
|
| 38 |
+
"A display of frozen desserts, including cupcakes and donuts, is arranged in a row on a counter.",
|
| 39 |
+
"[20.0, 18.5, 15.0, 10.5, 5.0]"
|
| 40 |
+
]
|
| 41 |
+
]
|
| 42 |
+
|
| 43 |
+
with gr.Blocks(title="Bokeh Effect Generator") as demo:
|
| 44 |
+
gr.Markdown("#Dynamic Bokeh Effect Generation")
|
| 45 |
+
|
| 46 |
+
with gr.Row():
|
| 47 |
+
with gr.Column():
|
| 48 |
+
scene_input = gr.Textbox(
|
| 49 |
+
label="Scene Description",
|
| 50 |
+
placeholder="Describe the scene you want to generate..."
|
| 51 |
+
)
|
| 52 |
+
bokeh_input = gr.Textbox(
|
| 53 |
+
label="Bokeh Blur Values",
|
| 54 |
+
placeholder="Enter 5 comma-separated values from 1-30 (e.g., [2.44, 8.3, 10.1, 17.2, 24.0])"
|
| 55 |
+
)
|
| 56 |
+
submit_btn = gr.Button("Generate Video", variant="primary")
|
| 57 |
+
|
| 58 |
+
with gr.Column():
|
| 59 |
+
video_output = gr.Video(label="Generated Video")
|
| 60 |
+
error_output = gr.Textbox(label="Error Messages", visible=False)
|
| 61 |
+
|
| 62 |
+
gr.Examples(
|
| 63 |
+
examples=examples,
|
| 64 |
+
inputs=[scene_input, bokeh_input],
|
| 65 |
+
outputs=[video_output],
|
| 66 |
+
fn=generate_video,
|
| 67 |
+
cache_examples=True
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
submit_btn.click(
|
| 71 |
+
fn=generate_video,
|
| 72 |
+
inputs=[scene_input, bokeh_input],
|
| 73 |
+
outputs=[video_output],
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
if __name__ == "__main__":
|
| 77 |
+
demo.launch(share=True)
|
app_color_temperature.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import tempfile
|
| 3 |
+
import json
|
| 4 |
+
from inference_color_temperature import load_models, run_inference, OmegaConf
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
# Initialize models once at startup
|
| 8 |
+
cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_color_temperature.yaml")
|
| 9 |
+
pipeline, device = load_models(cfg)
|
| 10 |
+
|
| 11 |
+
def generate_video(base_scene, color_temperature_list):
|
| 12 |
+
try:
|
| 13 |
+
# Validate input
|
| 14 |
+
if len(json.loads(color_temperature_list)) != 5:
|
| 15 |
+
raise ValueError("Exactly 5 color_temperature values required")
|
| 16 |
+
|
| 17 |
+
# Run inference
|
| 18 |
+
video_path = run_inference(
|
| 19 |
+
pipeline=pipeline,
|
| 20 |
+
tokenizer=pipeline.tokenizer,
|
| 21 |
+
text_encoder=pipeline.text_encoder,
|
| 22 |
+
base_scene=base_scene,
|
| 23 |
+
color_temperature_list=color_temperature_list,
|
| 24 |
+
device=device
|
| 25 |
+
)
|
| 26 |
+
return video_path
|
| 27 |
+
|
| 28 |
+
except Exception as e:
|
| 29 |
+
raise gr.Error(f"Generation failed: {str(e)}")
|
| 30 |
+
|
| 31 |
+
# Example inputs
|
| 32 |
+
examples = [
|
| 33 |
+
[
|
| 34 |
+
"A beautiful blue sky with a mountain range in the background.",
|
| 35 |
+
"[5455.0, 5155.0, 5555.0, 6555.0, 7555.0]"
|
| 36 |
+
],
|
| 37 |
+
[
|
| 38 |
+
"A red couch is situated in front of a window, which is filled with a variety of potted plants.",
|
| 39 |
+
"[3500.0, 5500.0, 6500.0, 7500.0, 8500.0]"
|
| 40 |
+
]
|
| 41 |
+
]
|
| 42 |
+
|
| 43 |
+
with gr.Blocks(title="Color Temperature Effect Generator") as demo:
|
| 44 |
+
gr.Markdown("# Dynamic Color Temperature Effect Generation")
|
| 45 |
+
|
| 46 |
+
with gr.Row():
|
| 47 |
+
with gr.Column():
|
| 48 |
+
scene_input = gr.Textbox(
|
| 49 |
+
label="Scene Description",
|
| 50 |
+
placeholder="Describe the scene you want to generate..."
|
| 51 |
+
)
|
| 52 |
+
color_temperature_input = gr.Textbox(
|
| 53 |
+
label="Color Temperature Values",
|
| 54 |
+
placeholder="Enter 5 comma-separated values from 2000-10000 (e.g., [3001.3, 4000.2, 4400.34, 5488.23, 8888.82])"
|
| 55 |
+
)
|
| 56 |
+
submit_btn = gr.Button("Generate Video", variant="primary")
|
| 57 |
+
|
| 58 |
+
with gr.Column():
|
| 59 |
+
video_output = gr.Video(label="Generated Video")
|
| 60 |
+
error_output = gr.Textbox(label="Error Messages", visible=False)
|
| 61 |
+
|
| 62 |
+
gr.Examples(
|
| 63 |
+
examples=examples,
|
| 64 |
+
inputs=[scene_input, color_temperature_input],
|
| 65 |
+
outputs=[video_output],
|
| 66 |
+
fn=generate_video,
|
| 67 |
+
cache_examples=True
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
submit_btn.click(
|
| 71 |
+
fn=generate_video,
|
| 72 |
+
inputs=[scene_input, color_temperature_input],
|
| 73 |
+
outputs=[video_output],
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
if __name__ == "__main__":
|
| 77 |
+
demo.launch(share=True)
|
app_focal_length.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import tempfile
|
| 3 |
+
import json
|
| 4 |
+
from inference_focal_length import load_models, run_inference, OmegaConf
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
# Initialize models once at startup
|
| 8 |
+
cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_focal_length.yaml")
|
| 9 |
+
pipeline, device = load_models(cfg)
|
| 10 |
+
|
| 11 |
+
def generate_video(base_scene, focal_length_list):
|
| 12 |
+
try:
|
| 13 |
+
# Validate input
|
| 14 |
+
if len(json.loads(focal_length_list)) != 5:
|
| 15 |
+
raise ValueError("Exactly 5 focal_length values required")
|
| 16 |
+
|
| 17 |
+
# Run inference
|
| 18 |
+
video_path = run_inference(
|
| 19 |
+
pipeline=pipeline,
|
| 20 |
+
tokenizer=pipeline.tokenizer,
|
| 21 |
+
text_encoder=pipeline.text_encoder,
|
| 22 |
+
base_scene=base_scene,
|
| 23 |
+
focal_length_list=focal_length_list,
|
| 24 |
+
device=device
|
| 25 |
+
)
|
| 26 |
+
return video_path
|
| 27 |
+
|
| 28 |
+
except Exception as e:
|
| 29 |
+
raise gr.Error(f"Generation failed: {str(e)}")
|
| 30 |
+
|
| 31 |
+
# Example inputs
|
| 32 |
+
examples = [
|
| 33 |
+
[
|
| 34 |
+
"A small office cubicle with a desk, computer, and chair.",
|
| 35 |
+
"[25.1, 36.1, 47.1, 58.1, 69.1]"
|
| 36 |
+
],
|
| 37 |
+
[
|
| 38 |
+
"A large, white couch is placed in a living room, with a mirror above it. The couch is covered with various items, including a blue box, a pink towel, and a pair of shoes.",
|
| 39 |
+
"[55.0, 46.0, 37.0, 28.0, 25.0]"
|
| 40 |
+
]
|
| 41 |
+
]
|
| 42 |
+
|
| 43 |
+
with gr.Blocks(title="Focal Length Effect Generator") as demo:
|
| 44 |
+
gr.Markdown("#Dynamic Focal Length Effect Generation")
|
| 45 |
+
|
| 46 |
+
with gr.Row():
|
| 47 |
+
with gr.Column():
|
| 48 |
+
scene_input = gr.Textbox(
|
| 49 |
+
label="Scene Description",
|
| 50 |
+
placeholder="Describe the scene you want to generate..."
|
| 51 |
+
)
|
| 52 |
+
focal_length_input = gr.Textbox(
|
| 53 |
+
label="Focal Length Values",
|
| 54 |
+
placeholder="Enter 5 comma-separated values from 24-70 (e.g., [25.1, 30.2, 33.3, 40.8, 54.0])"
|
| 55 |
+
)
|
| 56 |
+
submit_btn = gr.Button("Generate Video", variant="primary")
|
| 57 |
+
|
| 58 |
+
with gr.Column():
|
| 59 |
+
video_output = gr.Video(label="Generated Video")
|
| 60 |
+
error_output = gr.Textbox(label="Error Messages", visible=False)
|
| 61 |
+
|
| 62 |
+
gr.Examples(
|
| 63 |
+
examples=examples,
|
| 64 |
+
inputs=[scene_input, focal_length_input],
|
| 65 |
+
outputs=[video_output],
|
| 66 |
+
fn=generate_video,
|
| 67 |
+
cache_examples=True
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
submit_btn.click(
|
| 71 |
+
fn=generate_video,
|
| 72 |
+
inputs=[scene_input, focal_length_input],
|
| 73 |
+
outputs=[video_output],
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
if __name__ == "__main__":
|
| 77 |
+
demo.launch(share=True)
|
app_shutter_speed.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import tempfile
|
| 3 |
+
import json
|
| 4 |
+
from inference_shutter_speed import load_models, run_inference, OmegaConf
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
# Initialize models once at startup
|
| 8 |
+
cfg = OmegaConf.load("configs/inference_genphoto/adv3_256_384_genphoto_relora_shutter_speed.yaml")
|
| 9 |
+
pipeline, device = load_models(cfg)
|
| 10 |
+
|
| 11 |
+
def generate_video(base_scene, shutter_speed_list):
|
| 12 |
+
try:
|
| 13 |
+
# Validate input
|
| 14 |
+
if len(json.loads(shutter_speed_list)) != 5:
|
| 15 |
+
raise ValueError("Exactly 5 shutter_speed values required")
|
| 16 |
+
|
| 17 |
+
# Run inference
|
| 18 |
+
video_path = run_inference(
|
| 19 |
+
pipeline=pipeline,
|
| 20 |
+
tokenizer=pipeline.tokenizer,
|
| 21 |
+
text_encoder=pipeline.text_encoder,
|
| 22 |
+
base_scene=base_scene,
|
| 23 |
+
shutter_speed_list=shutter_speed_list,
|
| 24 |
+
device=device
|
| 25 |
+
)
|
| 26 |
+
return video_path
|
| 27 |
+
|
| 28 |
+
except Exception as e:
|
| 29 |
+
raise gr.Error(f"Generation failed: {str(e)}")
|
| 30 |
+
|
| 31 |
+
# Example inputs
|
| 32 |
+
examples = [
|
| 33 |
+
[
|
| 34 |
+
"A brown and orange leather handbag with a paw print on it sits next to a book.",
|
| 35 |
+
"[0.11, 0.22, 0.33, 0.44, 0.55]"
|
| 36 |
+
],
|
| 37 |
+
[
|
| 38 |
+
"A variety of potted plants are displayed on a windowsill, with some of them placed in yellow and white bowls. ",
|
| 39 |
+
"[0.29, 0.49, 0.69, 0.79, 0.89]"
|
| 40 |
+
]
|
| 41 |
+
]
|
| 42 |
+
|
| 43 |
+
with gr.Blocks(title="Shutter Speed Effect Generator") as demo:
|
| 44 |
+
gr.Markdown("#Dynamic Shutter Speed Effect Generation")
|
| 45 |
+
|
| 46 |
+
with gr.Row():
|
| 47 |
+
with gr.Column():
|
| 48 |
+
scene_input = gr.Textbox(
|
| 49 |
+
label="Scene Description",
|
| 50 |
+
placeholder="Describe the scene you want to generate..."
|
| 51 |
+
)
|
| 52 |
+
shutter_speed_input = gr.Textbox(
|
| 53 |
+
label="Shutter Speed Values",
|
| 54 |
+
placeholder="Enter 5 comma-separated values from 0.1-1.0 (e.g., [0.15, 0.32, 0.53, 0.62, 0.82])"
|
| 55 |
+
)
|
| 56 |
+
submit_btn = gr.Button("Generate Video", variant="primary")
|
| 57 |
+
|
| 58 |
+
with gr.Column():
|
| 59 |
+
video_output = gr.Video(label="Generated Video")
|
| 60 |
+
error_output = gr.Textbox(label="Error Messages", visible=False)
|
| 61 |
+
|
| 62 |
+
gr.Examples(
|
| 63 |
+
examples=examples,
|
| 64 |
+
inputs=[scene_input, shutter_speed_input],
|
| 65 |
+
outputs=[video_output],
|
| 66 |
+
fn=generate_video,
|
| 67 |
+
cache_examples=True
|
| 68 |
+
)
|
| 69 |
+
|
| 70 |
+
submit_btn.click(
|
| 71 |
+
fn=generate_video,
|
| 72 |
+
inputs=[scene_input, shutter_speed_input],
|
| 73 |
+
outputs=[video_output],
|
| 74 |
+
)
|
| 75 |
+
|
| 76 |
+
if __name__ == "__main__":
|
| 77 |
+
demo.launch(share=True)
|
configs/inference_genphoto/adv3_256_384_genphoto_relora_bokehK.yaml
CHANGED
|
@@ -1,13 +1,11 @@
|
|
| 1 |
-
output_dir: "inference_output/genphoto_bokehK"
|
| 2 |
|
| 3 |
-
pretrained_model_repo: "pandaphd/generative_photography"
|
| 4 |
-
pretrained_model_path: "stable-diffusion-v1-5"
|
| 5 |
|
|
|
|
| 6 |
unet_subfolder: "unet_merged"
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
camera_adaptor_ckpt: "weights/checkpoint-bokehK.ckpt"
|
| 9 |
-
lora_ckpt: "weights/RealEstate10K_LoRA.ckpt"
|
| 10 |
-
motion_module_ckpt: "weights/v3_sd15_mm.ckpt"
|
| 11 |
|
| 12 |
lora_rank: 2
|
| 13 |
lora_scale: 1.0
|
|
@@ -43,7 +41,6 @@ camera_encoder_kwargs:
|
|
| 43 |
attention_block_types: ["Temporal_Self", ]
|
| 44 |
temporal_position_encoding: true
|
| 45 |
temporal_position_encoding_max_len: 16
|
| 46 |
-
|
| 47 |
attention_processor_kwargs:
|
| 48 |
add_spatial: false
|
| 49 |
spatial_attn_names: 'attn1'
|
|
@@ -53,7 +50,6 @@ attention_processor_kwargs:
|
|
| 53 |
query_condition: true
|
| 54 |
key_value_condition: true
|
| 55 |
scale: 1.0
|
| 56 |
-
|
| 57 |
noise_scheduler_kwargs:
|
| 58 |
num_train_timesteps: 1000
|
| 59 |
beta_start: 0.00085
|
|
@@ -62,5 +58,6 @@ noise_scheduler_kwargs:
|
|
| 62 |
steps_offset: 1
|
| 63 |
clip_sample: false
|
| 64 |
|
|
|
|
| 65 |
num_workers: 8
|
| 66 |
global_seed: 42
|
|
|
|
|
|
|
| 1 |
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
pretrained_model_path: "./ckpts/stable-diffusion-v1-5/"
|
| 4 |
unet_subfolder: "unet_merged"
|
| 5 |
+
camera_adaptor_ckpt: "./ckpts/weights/checkpoint-bokehK.ckpt"
|
| 6 |
+
lora_ckpt: "./ckpts/weights/RealEstate10K_LoRA.ckpt"
|
| 7 |
+
motion_module_ckpt: "./ckpts/weights/v3_sd15_mm.ckpt"
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
lora_rank: 2
|
| 11 |
lora_scale: 1.0
|
|
|
|
| 41 |
attention_block_types: ["Temporal_Self", ]
|
| 42 |
temporal_position_encoding: true
|
| 43 |
temporal_position_encoding_max_len: 16
|
|
|
|
| 44 |
attention_processor_kwargs:
|
| 45 |
add_spatial: false
|
| 46 |
spatial_attn_names: 'attn1'
|
|
|
|
| 50 |
query_condition: true
|
| 51 |
key_value_condition: true
|
| 52 |
scale: 1.0
|
|
|
|
| 53 |
noise_scheduler_kwargs:
|
| 54 |
num_train_timesteps: 1000
|
| 55 |
beta_start: 0.00085
|
|
|
|
| 58 |
steps_offset: 1
|
| 59 |
clip_sample: false
|
| 60 |
|
| 61 |
+
|
| 62 |
num_workers: 8
|
| 63 |
global_seed: 42
|
configs/inference_genphoto/adv3_256_384_genphoto_relora_color_temperature.yaml
CHANGED
|
@@ -1,16 +1,13 @@
|
|
| 1 |
output_dir: "inference_output/genphoto_color_temperature"
|
| 2 |
-
|
| 3 |
-
pretrained_model_repo: "pandaphd/generative_photography"
|
| 4 |
-
pretrained_model_path: "stable-diffusion-v1-5"
|
| 5 |
-
|
| 6 |
unet_subfolder: "unet_merged"
|
| 7 |
|
| 8 |
-
camera_adaptor_ckpt: "weights/checkpoint-color_temperature.ckpt"
|
| 9 |
-
lora_ckpt: "weights/RealEstate10K_LoRA.ckpt"
|
| 10 |
-
motion_module_ckpt: "weights/v3_sd15_mm.ckpt"
|
| 11 |
|
| 12 |
lora_rank: 2
|
| 13 |
lora_scale: 1.0
|
|
|
|
|
|
|
| 14 |
motion_lora_rank: 0
|
| 15 |
motion_lora_scale: 1.0
|
| 16 |
|
|
|
|
| 1 |
output_dir: "inference_output/genphoto_color_temperature"
|
| 2 |
+
pretrained_model_path: "./ckpts/stable-diffusion-v1-5/"
|
|
|
|
|
|
|
|
|
|
| 3 |
unet_subfolder: "unet_merged"
|
| 4 |
|
| 5 |
+
camera_adaptor_ckpt: "./ckpts/weights/checkpoint-color_temperature.ckpt"
|
|
|
|
|
|
|
| 6 |
|
| 7 |
lora_rank: 2
|
| 8 |
lora_scale: 1.0
|
| 9 |
+
lora_ckpt: "./ckpts/weights/RealEstate10K_LoRA.ckpt"
|
| 10 |
+
motion_module_ckpt: "./ckpts/weights/v3_sd15_mm.ckpt"
|
| 11 |
motion_lora_rank: 0
|
| 12 |
motion_lora_scale: 1.0
|
| 13 |
|
configs/inference_genphoto/adv3_256_384_genphoto_relora_focal_length.yaml
CHANGED
|
@@ -1,16 +1,14 @@
|
|
| 1 |
output_dir: "inference_output/genphoto_focal_length"
|
| 2 |
-
|
| 3 |
-
pretrained_model_repo: "pandaphd/generative_photography"
|
| 4 |
-
pretrained_model_path: "stable-diffusion-v1-5"
|
| 5 |
-
|
| 6 |
unet_subfolder: "unet_merged"
|
| 7 |
|
| 8 |
-
camera_adaptor_ckpt: "weights/checkpoint-focal_length.ckpt"
|
| 9 |
-
|
| 10 |
-
motion_module_ckpt: "weights/v3_sd15_mm.ckpt"
|
| 11 |
|
| 12 |
lora_rank: 2
|
| 13 |
lora_scale: 1.0
|
|
|
|
|
|
|
| 14 |
motion_lora_rank: 0
|
| 15 |
motion_lora_scale: 1.0
|
| 16 |
|
|
|
|
| 1 |
output_dir: "inference_output/genphoto_focal_length"
|
| 2 |
+
pretrained_model_path: "./ckpts/stable-diffusion-v1-5/"
|
|
|
|
|
|
|
|
|
|
| 3 |
unet_subfolder: "unet_merged"
|
| 4 |
|
| 5 |
+
camera_adaptor_ckpt: "./ckpts/weights/checkpoint-focal_length.ckpt"
|
| 6 |
+
|
|
|
|
| 7 |
|
| 8 |
lora_rank: 2
|
| 9 |
lora_scale: 1.0
|
| 10 |
+
lora_ckpt: "./ckpts/weights/RealEstate10K_LoRA.ckpt"
|
| 11 |
+
motion_module_ckpt: "./ckpts/weights/v3_sd15_mm.ckpt"
|
| 12 |
motion_lora_rank: 0
|
| 13 |
motion_lora_scale: 1.0
|
| 14 |
|
configs/inference_genphoto/adv3_256_384_genphoto_relora_shutter_speed.yaml
CHANGED
|
@@ -1,16 +1,13 @@
|
|
| 1 |
output_dir: "inference_output/genphoto_shutter_speed"
|
| 2 |
-
|
| 3 |
-
pretrained_model_repo: "pandaphd/generative_photography"
|
| 4 |
-
pretrained_model_path: "stable-diffusion-v1-5"
|
| 5 |
-
|
| 6 |
unet_subfolder: "unet_merged"
|
| 7 |
|
| 8 |
-
camera_adaptor_ckpt: "weights/checkpoint-shutter_speed.ckpt"
|
| 9 |
-
lora_ckpt: "weights/RealEstate10K_LoRA.ckpt"
|
| 10 |
-
motion_module_ckpt: "weights/v3_sd15_mm.ckpt"
|
| 11 |
|
| 12 |
lora_rank: 2
|
| 13 |
lora_scale: 1.0
|
|
|
|
|
|
|
| 14 |
motion_lora_rank: 0
|
| 15 |
motion_lora_scale: 1.0
|
| 16 |
|
|
|
|
| 1 |
output_dir: "inference_output/genphoto_shutter_speed"
|
| 2 |
+
pretrained_model_path: "./ckpts/stable-diffusion-v1-5/"
|
|
|
|
|
|
|
|
|
|
| 3 |
unet_subfolder: "unet_merged"
|
| 4 |
|
| 5 |
+
camera_adaptor_ckpt: "./ckpts/weights/checkpoint-shutter_speed.ckpt"
|
|
|
|
|
|
|
| 6 |
|
| 7 |
lora_rank: 2
|
| 8 |
lora_scale: 1.0
|
| 9 |
+
lora_ckpt: "./ckpts/weights/RealEstate10K_LoRA.ckpt"
|
| 10 |
+
motion_module_ckpt: "./ckpts/weights/v3_sd15_mm.ckpt"
|
| 11 |
motion_lora_rank: 0
|
| 12 |
motion_lora_scale: 1.0
|
| 13 |
|
inference_bokehK.py
CHANGED
|
@@ -22,11 +22,6 @@ from genphoto.utils.util import save_videos_grid
|
|
| 22 |
logging.basicConfig(level=logging.INFO)
|
| 23 |
logger = logging.getLogger(__name__)
|
| 24 |
|
| 25 |
-
|
| 26 |
-
from huggingface_hub import hf_hub_download
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
def create_bokehK_embedding(bokehK_values, target_height, target_width):
|
| 31 |
f = bokehK_values.shape[0]
|
| 32 |
bokehK_embedding = torch.zeros((f, 3, target_height, target_width), dtype=bokehK_values.dtype)
|
|
@@ -94,24 +89,18 @@ class Camera_Embedding(Dataset):
|
|
| 94 |
camera_embedding = torch.cat((bokehK_embedding, ccl_embedding), dim=1)
|
| 95 |
return camera_embedding
|
| 96 |
|
|
|
|
| 97 |
def load_models(cfg):
|
| 98 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 99 |
|
| 100 |
-
pretrained_model_path = hf_hub_download("pandaphd/generative_photography", "stable-diffusion-v1-5/")
|
| 101 |
-
lora_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/RealEstate10K_LoRA.ckpt")
|
| 102 |
-
motion_module_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/v3_sd15_mm.ckpt")
|
| 103 |
-
camera_adaptor_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/checkpoint-bokehK.ckpt")
|
| 104 |
-
|
| 105 |
noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
|
| 106 |
-
vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae").to(device)
|
| 107 |
vae.requires_grad_(False)
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder").to(device)
|
| 111 |
text_encoder.requires_grad_(False)
|
| 112 |
-
|
| 113 |
unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
|
| 114 |
-
pretrained_model_path,
|
| 115 |
subfolder=cfg.unet_subfolder,
|
| 116 |
unet_additional_kwargs=cfg.unet_additional_kwargs
|
| 117 |
).to(device)
|
|
@@ -132,26 +121,26 @@ def load_models(cfg):
|
|
| 132 |
)
|
| 133 |
|
| 134 |
if cfg.lora_ckpt is not None:
|
| 135 |
-
lora_checkpoints = torch.load(
|
| 136 |
if 'lora_state_dict' in lora_checkpoints.keys():
|
| 137 |
lora_checkpoints = lora_checkpoints['lora_state_dict']
|
| 138 |
_, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
|
| 139 |
assert len(lora_u) == 0
|
| 140 |
|
| 141 |
if cfg.motion_module_ckpt is not None:
|
| 142 |
-
mm_checkpoints = torch.load(
|
| 143 |
_, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
|
| 144 |
assert len(mm_u) == 0
|
| 145 |
-
|
| 146 |
if cfg.camera_adaptor_ckpt is not None:
|
| 147 |
-
camera_adaptor_checkpoint = torch.load(
|
| 148 |
camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
|
| 149 |
attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
|
| 150 |
camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
|
| 151 |
assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
|
| 152 |
_, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
|
| 153 |
assert len(attention_processor_u) == 0
|
| 154 |
-
|
| 155 |
pipeline = GenPhotoPipeline(
|
| 156 |
vae=vae,
|
| 157 |
text_encoder=text_encoder,
|
|
@@ -160,10 +149,12 @@ def load_models(cfg):
|
|
| 160 |
scheduler=noise_scheduler,
|
| 161 |
camera_encoder=camera_encoder
|
| 162 |
).to(device)
|
| 163 |
-
|
| 164 |
pipeline.enable_vae_slicing()
|
|
|
|
| 165 |
return pipeline, device
|
| 166 |
|
|
|
|
|
|
|
| 167 |
def run_inference(pipeline, tokenizer, text_encoder, base_scene, bokehK_list, device, video_length=5, height=256, width=384):
|
| 168 |
|
| 169 |
|
|
|
|
| 22 |
logging.basicConfig(level=logging.INFO)
|
| 23 |
logger = logging.getLogger(__name__)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def create_bokehK_embedding(bokehK_values, target_height, target_width):
|
| 26 |
f = bokehK_values.shape[0]
|
| 27 |
bokehK_embedding = torch.zeros((f, 3, target_height, target_width), dtype=bokehK_values.dtype)
|
|
|
|
| 89 |
camera_embedding = torch.cat((bokehK_embedding, ccl_embedding), dim=1)
|
| 90 |
return camera_embedding
|
| 91 |
|
| 92 |
+
|
| 93 |
def load_models(cfg):
|
| 94 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
|
| 97 |
+
vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
|
| 98 |
vae.requires_grad_(False)
|
| 99 |
+
tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
|
| 100 |
+
text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
|
|
|
|
| 101 |
text_encoder.requires_grad_(False)
|
|
|
|
| 102 |
unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
|
| 103 |
+
cfg.pretrained_model_path,
|
| 104 |
subfolder=cfg.unet_subfolder,
|
| 105 |
unet_additional_kwargs=cfg.unet_additional_kwargs
|
| 106 |
).to(device)
|
|
|
|
| 121 |
)
|
| 122 |
|
| 123 |
if cfg.lora_ckpt is not None:
|
| 124 |
+
lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
|
| 125 |
if 'lora_state_dict' in lora_checkpoints.keys():
|
| 126 |
lora_checkpoints = lora_checkpoints['lora_state_dict']
|
| 127 |
_, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
|
| 128 |
assert len(lora_u) == 0
|
| 129 |
|
| 130 |
if cfg.motion_module_ckpt is not None:
|
| 131 |
+
mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
|
| 132 |
_, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
|
| 133 |
assert len(mm_u) == 0
|
| 134 |
+
|
| 135 |
if cfg.camera_adaptor_ckpt is not None:
|
| 136 |
+
camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
|
| 137 |
camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
|
| 138 |
attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
|
| 139 |
camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
|
| 140 |
assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
|
| 141 |
_, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
|
| 142 |
assert len(attention_processor_u) == 0
|
| 143 |
+
|
| 144 |
pipeline = GenPhotoPipeline(
|
| 145 |
vae=vae,
|
| 146 |
text_encoder=text_encoder,
|
|
|
|
| 149 |
scheduler=noise_scheduler,
|
| 150 |
camera_encoder=camera_encoder
|
| 151 |
).to(device)
|
|
|
|
| 152 |
pipeline.enable_vae_slicing()
|
| 153 |
+
|
| 154 |
return pipeline, device
|
| 155 |
|
| 156 |
+
|
| 157 |
+
|
| 158 |
def run_inference(pipeline, tokenizer, text_encoder, base_scene, bokehK_list, device, video_length=5, height=256, width=384):
|
| 159 |
|
| 160 |
|
inference_color_temperature.py
CHANGED
|
@@ -22,7 +22,6 @@ from genphoto.utils.util import save_videos_grid
|
|
| 22 |
logging.basicConfig(level=logging.INFO)
|
| 23 |
logger = logging.getLogger(__name__)
|
| 24 |
|
| 25 |
-
from huggingface_hub import hf_hub_download
|
| 26 |
|
| 27 |
|
| 28 |
def kelvin_to_rgb(kelvin):
|
|
@@ -132,104 +131,19 @@ class Camera_Embedding(Dataset):
|
|
| 132 |
camera_embedding = torch.cat((color_temperature_embedding, ccl_embedding), dim=1)
|
| 133 |
return camera_embedding
|
| 134 |
|
| 135 |
-
#
|
| 136 |
-
# def load_models(cfg):
|
| 137 |
-
#
|
| 138 |
-
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 139 |
-
#
|
| 140 |
-
# noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
|
| 141 |
-
# vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
|
| 142 |
-
# vae.requires_grad_(False)
|
| 143 |
-
# tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
|
| 144 |
-
# text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
|
| 145 |
-
# text_encoder.requires_grad_(False)
|
| 146 |
-
# unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
|
| 147 |
-
# cfg.pretrained_model_path,
|
| 148 |
-
# subfolder=cfg.unet_subfolder,
|
| 149 |
-
# unet_additional_kwargs=cfg.unet_additional_kwargs
|
| 150 |
-
# ).to(device)
|
| 151 |
-
# unet.requires_grad_(False)
|
| 152 |
-
#
|
| 153 |
-
# camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
|
| 154 |
-
# camera_encoder.requires_grad_(False)
|
| 155 |
-
# camera_adaptor = CameraAdaptor(unet, camera_encoder)
|
| 156 |
-
# camera_adaptor.requires_grad_(False)
|
| 157 |
-
# camera_adaptor.to(device)
|
| 158 |
-
#
|
| 159 |
-
# logger.info("Setting the attention processors")
|
| 160 |
-
# unet.set_all_attn_processor(
|
| 161 |
-
# add_spatial_lora=cfg.lora_ckpt is not None,
|
| 162 |
-
# add_motion_lora=cfg.motion_lora_rank > 0,
|
| 163 |
-
# lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
|
| 164 |
-
# motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
|
| 165 |
-
# **cfg.attention_processor_kwargs
|
| 166 |
-
# )
|
| 167 |
-
#
|
| 168 |
-
# if cfg.lora_ckpt is not None:
|
| 169 |
-
# print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
|
| 170 |
-
# lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
|
| 171 |
-
# if 'lora_state_dict' in lora_checkpoints.keys():
|
| 172 |
-
# lora_checkpoints = lora_checkpoints['lora_state_dict']
|
| 173 |
-
# _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
|
| 174 |
-
# assert len(lora_u) == 0
|
| 175 |
-
# print(f'Loading done')
|
| 176 |
-
#
|
| 177 |
-
# if cfg.motion_module_ckpt is not None:
|
| 178 |
-
# print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
|
| 179 |
-
# mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
|
| 180 |
-
# _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
|
| 181 |
-
# assert len(mm_u) == 0
|
| 182 |
-
# print("Loading done")
|
| 183 |
-
#
|
| 184 |
-
#
|
| 185 |
-
# if cfg.camera_adaptor_ckpt is not None:
|
| 186 |
-
# logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
|
| 187 |
-
# camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
|
| 188 |
-
# camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
|
| 189 |
-
# attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
|
| 190 |
-
# camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
|
| 191 |
-
#
|
| 192 |
-
# assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
|
| 193 |
-
# _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
|
| 194 |
-
# assert len(attention_processor_u) == 0
|
| 195 |
-
#
|
| 196 |
-
# logger.info("Camera Adaptor loading done")
|
| 197 |
-
# else:
|
| 198 |
-
# logger.info("No Camera Adaptor checkpoint used")
|
| 199 |
-
#
|
| 200 |
-
# pipeline = GenPhotoPipeline(
|
| 201 |
-
# vae=vae,
|
| 202 |
-
# text_encoder=text_encoder,
|
| 203 |
-
# tokenizer=tokenizer,
|
| 204 |
-
# unet=unet,
|
| 205 |
-
# scheduler=noise_scheduler,
|
| 206 |
-
# camera_encoder=camera_encoder
|
| 207 |
-
# ).to(device)
|
| 208 |
-
#
|
| 209 |
-
# pipeline.enable_vae_slicing()
|
| 210 |
-
#
|
| 211 |
-
# return pipeline, device
|
| 212 |
-
|
| 213 |
-
|
| 214 |
|
| 215 |
def load_models(cfg):
|
| 216 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 217 |
|
| 218 |
-
|
| 219 |
-
lora_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/RealEstate10K_LoRA.ckpt")
|
| 220 |
-
motion_module_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/v3_sd15_mm.ckpt")
|
| 221 |
-
camera_adaptor_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/checkpoint-color_temperature.ckpt")
|
| 222 |
|
| 223 |
noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
|
| 224 |
-
vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae").to(device)
|
| 225 |
vae.requires_grad_(False)
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder").to(device)
|
| 229 |
text_encoder.requires_grad_(False)
|
| 230 |
-
|
| 231 |
unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
|
| 232 |
-
pretrained_model_path,
|
| 233 |
subfolder=cfg.unet_subfolder,
|
| 234 |
unet_additional_kwargs=cfg.unet_additional_kwargs
|
| 235 |
).to(device)
|
|
@@ -241,6 +155,7 @@ def load_models(cfg):
|
|
| 241 |
camera_adaptor.requires_grad_(False)
|
| 242 |
camera_adaptor.to(device)
|
| 243 |
|
|
|
|
| 244 |
unet.set_all_attn_processor(
|
| 245 |
add_spatial_lora=cfg.lora_ckpt is not None,
|
| 246 |
add_motion_lora=cfg.motion_lora_rank > 0,
|
|
@@ -250,25 +165,36 @@ def load_models(cfg):
|
|
| 250 |
)
|
| 251 |
|
| 252 |
if cfg.lora_ckpt is not None:
|
| 253 |
-
|
|
|
|
| 254 |
if 'lora_state_dict' in lora_checkpoints.keys():
|
| 255 |
lora_checkpoints = lora_checkpoints['lora_state_dict']
|
| 256 |
_, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
|
| 257 |
assert len(lora_u) == 0
|
|
|
|
| 258 |
|
| 259 |
if cfg.motion_module_ckpt is not None:
|
| 260 |
-
|
|
|
|
| 261 |
_, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
|
| 262 |
assert len(mm_u) == 0
|
|
|
|
|
|
|
| 263 |
|
| 264 |
if cfg.camera_adaptor_ckpt is not None:
|
| 265 |
-
|
|
|
|
| 266 |
camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
|
| 267 |
attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
|
| 268 |
camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
|
|
|
|
| 269 |
assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
|
| 270 |
_, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
|
| 271 |
assert len(attention_processor_u) == 0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
|
| 273 |
pipeline = GenPhotoPipeline(
|
| 274 |
vae=vae,
|
|
@@ -280,9 +206,8 @@ def load_models(cfg):
|
|
| 280 |
).to(device)
|
| 281 |
|
| 282 |
pipeline.enable_vae_slicing()
|
| 283 |
-
return pipeline, device
|
| 284 |
-
|
| 285 |
|
|
|
|
| 286 |
|
| 287 |
|
| 288 |
def run_inference(pipeline, tokenizer, text_encoder, base_scene, color_temperature_list, device, video_length=5, height=256, width=384):
|
|
|
|
| 22 |
logging.basicConfig(level=logging.INFO)
|
| 23 |
logger = logging.getLogger(__name__)
|
| 24 |
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
def kelvin_to_rgb(kelvin):
|
|
|
|
| 131 |
camera_embedding = torch.cat((color_temperature_embedding, ccl_embedding), dim=1)
|
| 132 |
return camera_embedding
|
| 133 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
def load_models(cfg):
|
|
|
|
| 136 |
|
| 137 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
|
| 140 |
+
vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
|
| 141 |
vae.requires_grad_(False)
|
| 142 |
+
tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
|
| 143 |
+
text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
|
|
|
|
| 144 |
text_encoder.requires_grad_(False)
|
|
|
|
| 145 |
unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
|
| 146 |
+
cfg.pretrained_model_path,
|
| 147 |
subfolder=cfg.unet_subfolder,
|
| 148 |
unet_additional_kwargs=cfg.unet_additional_kwargs
|
| 149 |
).to(device)
|
|
|
|
| 155 |
camera_adaptor.requires_grad_(False)
|
| 156 |
camera_adaptor.to(device)
|
| 157 |
|
| 158 |
+
logger.info("Setting the attention processors")
|
| 159 |
unet.set_all_attn_processor(
|
| 160 |
add_spatial_lora=cfg.lora_ckpt is not None,
|
| 161 |
add_motion_lora=cfg.motion_lora_rank > 0,
|
|
|
|
| 165 |
)
|
| 166 |
|
| 167 |
if cfg.lora_ckpt is not None:
|
| 168 |
+
print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
|
| 169 |
+
lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
|
| 170 |
if 'lora_state_dict' in lora_checkpoints.keys():
|
| 171 |
lora_checkpoints = lora_checkpoints['lora_state_dict']
|
| 172 |
_, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
|
| 173 |
assert len(lora_u) == 0
|
| 174 |
+
print(f'Loading done')
|
| 175 |
|
| 176 |
if cfg.motion_module_ckpt is not None:
|
| 177 |
+
print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
|
| 178 |
+
mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
|
| 179 |
_, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
|
| 180 |
assert len(mm_u) == 0
|
| 181 |
+
print("Loading done")
|
| 182 |
+
|
| 183 |
|
| 184 |
if cfg.camera_adaptor_ckpt is not None:
|
| 185 |
+
logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
|
| 186 |
+
camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
|
| 187 |
camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
|
| 188 |
attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
|
| 189 |
camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
|
| 190 |
+
|
| 191 |
assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
|
| 192 |
_, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
|
| 193 |
assert len(attention_processor_u) == 0
|
| 194 |
+
|
| 195 |
+
logger.info("Camera Adaptor loading done")
|
| 196 |
+
else:
|
| 197 |
+
logger.info("No Camera Adaptor checkpoint used")
|
| 198 |
|
| 199 |
pipeline = GenPhotoPipeline(
|
| 200 |
vae=vae,
|
|
|
|
| 206 |
).to(device)
|
| 207 |
|
| 208 |
pipeline.enable_vae_slicing()
|
|
|
|
|
|
|
| 209 |
|
| 210 |
+
return pipeline, device
|
| 211 |
|
| 212 |
|
| 213 |
def run_inference(pipeline, tokenizer, text_encoder, base_scene, color_temperature_list, device, video_length=5, height=256, width=384):
|
inference_focal_length.py
CHANGED
|
@@ -24,9 +24,6 @@ logger = logging.getLogger(__name__)
|
|
| 24 |
|
| 25 |
|
| 26 |
|
| 27 |
-
from huggingface_hub import hf_hub_download
|
| 28 |
-
|
| 29 |
-
|
| 30 |
|
| 31 |
def create_focal_length_embedding(focal_length_values, target_height, target_width, base_focal_length=24.0, sensor_height=24.0, sensor_width=36.0):
|
| 32 |
device = 'cpu'
|
|
@@ -137,101 +134,19 @@ class Camera_Embedding(Dataset):
|
|
| 137 |
camera_embedding = torch.cat((focal_length_embedding, ccl_embedding), dim=1)
|
| 138 |
return camera_embedding
|
| 139 |
|
| 140 |
-
#
|
| 141 |
-
# def load_models(cfg):
|
| 142 |
-
#
|
| 143 |
-
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 144 |
-
#
|
| 145 |
-
# noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
|
| 146 |
-
# vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
|
| 147 |
-
# vae.requires_grad_(False)
|
| 148 |
-
# tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
|
| 149 |
-
# text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
|
| 150 |
-
# text_encoder.requires_grad_(False)
|
| 151 |
-
# unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
|
| 152 |
-
# cfg.pretrained_model_path,
|
| 153 |
-
# subfolder=cfg.unet_subfolder,
|
| 154 |
-
# unet_additional_kwargs=cfg.unet_additional_kwargs
|
| 155 |
-
# ).to(device)
|
| 156 |
-
# unet.requires_grad_(False)
|
| 157 |
-
#
|
| 158 |
-
# camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
|
| 159 |
-
# camera_encoder.requires_grad_(False)
|
| 160 |
-
# camera_adaptor = CameraAdaptor(unet, camera_encoder)
|
| 161 |
-
# camera_adaptor.requires_grad_(False)
|
| 162 |
-
# camera_adaptor.to(device)
|
| 163 |
-
#
|
| 164 |
-
# logger.info("Setting the attention processors")
|
| 165 |
-
# unet.set_all_attn_processor(
|
| 166 |
-
# add_spatial_lora=cfg.lora_ckpt is not None,
|
| 167 |
-
# add_motion_lora=cfg.motion_lora_rank > 0,
|
| 168 |
-
# lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
|
| 169 |
-
# motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
|
| 170 |
-
# **cfg.attention_processor_kwargs
|
| 171 |
-
# )
|
| 172 |
-
#
|
| 173 |
-
# if cfg.lora_ckpt is not None:
|
| 174 |
-
# print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
|
| 175 |
-
# lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
|
| 176 |
-
# if 'lora_state_dict' in lora_checkpoints.keys():
|
| 177 |
-
# lora_checkpoints = lora_checkpoints['lora_state_dict']
|
| 178 |
-
# _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
|
| 179 |
-
# assert len(lora_u) == 0
|
| 180 |
-
# print(f'Loading done')
|
| 181 |
-
#
|
| 182 |
-
# if cfg.motion_module_ckpt is not None:
|
| 183 |
-
# print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
|
| 184 |
-
# mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
|
| 185 |
-
# _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
|
| 186 |
-
# assert len(mm_u) == 0
|
| 187 |
-
# print("Loading done")
|
| 188 |
-
#
|
| 189 |
-
# if cfg.camera_adaptor_ckpt is not None:
|
| 190 |
-
# logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
|
| 191 |
-
# camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
|
| 192 |
-
# camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
|
| 193 |
-
# attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
|
| 194 |
-
# camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
|
| 195 |
-
#
|
| 196 |
-
# assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
|
| 197 |
-
# _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
|
| 198 |
-
# assert len(attention_processor_u) == 0
|
| 199 |
-
#
|
| 200 |
-
# logger.info("Camera Adaptor loading done")
|
| 201 |
-
# else:
|
| 202 |
-
# logger.info("No Camera Adaptor checkpoint used")
|
| 203 |
-
#
|
| 204 |
-
# pipeline = GenPhotoPipeline(
|
| 205 |
-
# vae=vae,
|
| 206 |
-
# text_encoder=text_encoder,
|
| 207 |
-
# tokenizer=tokenizer,
|
| 208 |
-
# unet=unet,
|
| 209 |
-
# scheduler=noise_scheduler,
|
| 210 |
-
# camera_encoder=camera_encoder
|
| 211 |
-
# ).to(device)
|
| 212 |
-
# pipeline.enable_vae_slicing()
|
| 213 |
-
#
|
| 214 |
-
# return pipeline, device
|
| 215 |
-
|
| 216 |
|
| 217 |
def load_models(cfg):
|
| 218 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 219 |
|
| 220 |
-
|
| 221 |
-
lora_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/RealEstate10K_LoRA.ckpt")
|
| 222 |
-
motion_module_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/v3_sd15_mm.ckpt")
|
| 223 |
-
camera_adaptor_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/checkpoint-focal_length.ckpt")
|
| 224 |
|
| 225 |
noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
|
| 226 |
-
vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae").to(device)
|
| 227 |
vae.requires_grad_(False)
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder").to(device)
|
| 231 |
text_encoder.requires_grad_(False)
|
| 232 |
-
|
| 233 |
unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
|
| 234 |
-
pretrained_model_path,
|
| 235 |
subfolder=cfg.unet_subfolder,
|
| 236 |
unet_additional_kwargs=cfg.unet_additional_kwargs
|
| 237 |
).to(device)
|
|
@@ -243,6 +158,7 @@ def load_models(cfg):
|
|
| 243 |
camera_adaptor.requires_grad_(False)
|
| 244 |
camera_adaptor.to(device)
|
| 245 |
|
|
|
|
| 246 |
unet.set_all_attn_processor(
|
| 247 |
add_spatial_lora=cfg.lora_ckpt is not None,
|
| 248 |
add_motion_lora=cfg.motion_lora_rank > 0,
|
|
@@ -252,25 +168,35 @@ def load_models(cfg):
|
|
| 252 |
)
|
| 253 |
|
| 254 |
if cfg.lora_ckpt is not None:
|
| 255 |
-
|
|
|
|
| 256 |
if 'lora_state_dict' in lora_checkpoints.keys():
|
| 257 |
lora_checkpoints = lora_checkpoints['lora_state_dict']
|
| 258 |
_, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
|
| 259 |
assert len(lora_u) == 0
|
|
|
|
| 260 |
|
| 261 |
if cfg.motion_module_ckpt is not None:
|
| 262 |
-
|
|
|
|
| 263 |
_, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
|
| 264 |
assert len(mm_u) == 0
|
| 265 |
-
|
|
|
|
| 266 |
if cfg.camera_adaptor_ckpt is not None:
|
| 267 |
-
|
|
|
|
| 268 |
camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
|
| 269 |
attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
|
| 270 |
camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
|
|
|
|
| 271 |
assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
|
| 272 |
_, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
|
| 273 |
assert len(attention_processor_u) == 0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
|
| 275 |
pipeline = GenPhotoPipeline(
|
| 276 |
vae=vae,
|
|
@@ -280,10 +206,11 @@ def load_models(cfg):
|
|
| 280 |
scheduler=noise_scheduler,
|
| 281 |
camera_encoder=camera_encoder
|
| 282 |
).to(device)
|
| 283 |
-
|
| 284 |
pipeline.enable_vae_slicing()
|
|
|
|
| 285 |
return pipeline, device
|
| 286 |
|
|
|
|
| 287 |
def run_inference(pipeline, tokenizer, text_encoder, base_scene, focal_length_list, device, video_length=5, height=256, width=384):
|
| 288 |
|
| 289 |
focal_length_values = json.loads(focal_length_list)
|
|
|
|
| 24 |
|
| 25 |
|
| 26 |
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
def create_focal_length_embedding(focal_length_values, target_height, target_width, base_focal_length=24.0, sensor_height=24.0, sensor_width=36.0):
|
| 29 |
device = 'cpu'
|
|
|
|
| 134 |
camera_embedding = torch.cat((focal_length_embedding, ccl_embedding), dim=1)
|
| 135 |
return camera_embedding
|
| 136 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
def load_models(cfg):
|
|
|
|
| 139 |
|
| 140 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
|
| 143 |
+
vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
|
| 144 |
vae.requires_grad_(False)
|
| 145 |
+
tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
|
| 146 |
+
text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
|
|
|
|
| 147 |
text_encoder.requires_grad_(False)
|
|
|
|
| 148 |
unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
|
| 149 |
+
cfg.pretrained_model_path,
|
| 150 |
subfolder=cfg.unet_subfolder,
|
| 151 |
unet_additional_kwargs=cfg.unet_additional_kwargs
|
| 152 |
).to(device)
|
|
|
|
| 158 |
camera_adaptor.requires_grad_(False)
|
| 159 |
camera_adaptor.to(device)
|
| 160 |
|
| 161 |
+
logger.info("Setting the attention processors")
|
| 162 |
unet.set_all_attn_processor(
|
| 163 |
add_spatial_lora=cfg.lora_ckpt is not None,
|
| 164 |
add_motion_lora=cfg.motion_lora_rank > 0,
|
|
|
|
| 168 |
)
|
| 169 |
|
| 170 |
if cfg.lora_ckpt is not None:
|
| 171 |
+
print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
|
| 172 |
+
lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
|
| 173 |
if 'lora_state_dict' in lora_checkpoints.keys():
|
| 174 |
lora_checkpoints = lora_checkpoints['lora_state_dict']
|
| 175 |
_, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
|
| 176 |
assert len(lora_u) == 0
|
| 177 |
+
print(f'Loading done')
|
| 178 |
|
| 179 |
if cfg.motion_module_ckpt is not None:
|
| 180 |
+
print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
|
| 181 |
+
mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
|
| 182 |
_, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
|
| 183 |
assert len(mm_u) == 0
|
| 184 |
+
print("Loading done")
|
| 185 |
+
|
| 186 |
if cfg.camera_adaptor_ckpt is not None:
|
| 187 |
+
logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
|
| 188 |
+
camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
|
| 189 |
camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
|
| 190 |
attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
|
| 191 |
camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
|
| 192 |
+
|
| 193 |
assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
|
| 194 |
_, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
|
| 195 |
assert len(attention_processor_u) == 0
|
| 196 |
+
|
| 197 |
+
logger.info("Camera Adaptor loading done")
|
| 198 |
+
else:
|
| 199 |
+
logger.info("No Camera Adaptor checkpoint used")
|
| 200 |
|
| 201 |
pipeline = GenPhotoPipeline(
|
| 202 |
vae=vae,
|
|
|
|
| 206 |
scheduler=noise_scheduler,
|
| 207 |
camera_encoder=camera_encoder
|
| 208 |
).to(device)
|
|
|
|
| 209 |
pipeline.enable_vae_slicing()
|
| 210 |
+
|
| 211 |
return pipeline, device
|
| 212 |
|
| 213 |
+
|
| 214 |
def run_inference(pipeline, tokenizer, text_encoder, base_scene, focal_length_list, device, video_length=5, height=256, width=384):
|
| 215 |
|
| 216 |
focal_length_values = json.loads(focal_length_list)
|
inference_shutter_speed.py
CHANGED
|
@@ -22,11 +22,6 @@ from genphoto.utils.util import save_videos_grid
|
|
| 22 |
logging.basicConfig(level=logging.INFO)
|
| 23 |
logger = logging.getLogger(__name__)
|
| 24 |
|
| 25 |
-
|
| 26 |
-
from huggingface_hub import hf_hub_download
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
def create_shutter_speed_embedding(shutter_speed_values, target_height, target_width, base_exposure=0.5):
|
| 31 |
"""
|
| 32 |
Create a shutter_speed (Exposure Value or shutter speed) embedding tensor using a constant fwc value.
|
|
@@ -119,115 +114,32 @@ class Camera_Embedding(Dataset):
|
|
| 119 |
return camera_embedding
|
| 120 |
|
| 121 |
|
| 122 |
-
# def load_models(cfg):
|
| 123 |
-
#
|
| 124 |
-
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 125 |
-
#
|
| 126 |
-
# noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
|
| 127 |
-
# vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
|
| 128 |
-
# vae.requires_grad_(False)
|
| 129 |
-
# tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
|
| 130 |
-
# text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
|
| 131 |
-
# text_encoder.requires_grad_(False)
|
| 132 |
-
#
|
| 133 |
-
# unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
|
| 134 |
-
# cfg.pretrained_model_path,
|
| 135 |
-
# subfolder=cfg.unet_subfolder,
|
| 136 |
-
# unet_additional_kwargs=cfg.unet_additional_kwargs
|
| 137 |
-
# ).to(device)
|
| 138 |
-
# unet.requires_grad_(False)
|
| 139 |
-
#
|
| 140 |
-
#
|
| 141 |
-
# camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
|
| 142 |
-
# camera_encoder.requires_grad_(False)
|
| 143 |
-
# camera_adaptor = CameraAdaptor(unet, camera_encoder)
|
| 144 |
-
# camera_adaptor.requires_grad_(False)
|
| 145 |
-
# camera_adaptor.to(device)
|
| 146 |
-
#
|
| 147 |
-
# logger.info("Setting the attention processors")
|
| 148 |
-
# unet.set_all_attn_processor(
|
| 149 |
-
# add_spatial_lora=cfg.lora_ckpt is not None,
|
| 150 |
-
# add_motion_lora=cfg.motion_lora_rank > 0,
|
| 151 |
-
# lora_kwargs={"lora_rank": cfg.lora_rank, "lora_scale": cfg.lora_scale},
|
| 152 |
-
# motion_lora_kwargs={"lora_rank": cfg.motion_lora_rank, "lora_scale": cfg.motion_lora_scale},
|
| 153 |
-
# **cfg.attention_processor_kwargs
|
| 154 |
-
# )
|
| 155 |
-
#
|
| 156 |
-
# if cfg.lora_ckpt is not None:
|
| 157 |
-
# print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
|
| 158 |
-
# lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
|
| 159 |
-
# if 'lora_state_dict' in lora_checkpoints.keys():
|
| 160 |
-
# lora_checkpoints = lora_checkpoints['lora_state_dict']
|
| 161 |
-
# _, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
|
| 162 |
-
# assert len(lora_u) == 0
|
| 163 |
-
# print(f'Loading done')
|
| 164 |
-
#
|
| 165 |
-
# if cfg.motion_module_ckpt is not None:
|
| 166 |
-
# print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
|
| 167 |
-
# mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
|
| 168 |
-
# _, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
|
| 169 |
-
# assert len(mm_u) == 0
|
| 170 |
-
# print("Loading done")
|
| 171 |
-
#
|
| 172 |
-
#
|
| 173 |
-
# if cfg.camera_adaptor_ckpt is not None:
|
| 174 |
-
# logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
|
| 175 |
-
# camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
|
| 176 |
-
#
|
| 177 |
-
# camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
|
| 178 |
-
# attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
|
| 179 |
-
#
|
| 180 |
-
# camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
|
| 181 |
-
#
|
| 182 |
-
# assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
|
| 183 |
-
# _, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
|
| 184 |
-
# assert len(attention_processor_u) == 0
|
| 185 |
-
#
|
| 186 |
-
# logger.info("Camera Adaptor loading done")
|
| 187 |
-
# else:
|
| 188 |
-
# logger.info("No Camera Adaptor checkpoint used")
|
| 189 |
-
#
|
| 190 |
-
# pipeline = GenPhotoPipeline(
|
| 191 |
-
# vae=vae,
|
| 192 |
-
# text_encoder=text_encoder,
|
| 193 |
-
# tokenizer=tokenizer,
|
| 194 |
-
# unet=unet,
|
| 195 |
-
# scheduler=noise_scheduler,
|
| 196 |
-
# camera_encoder=camera_encoder
|
| 197 |
-
# ).to(device)
|
| 198 |
-
# pipeline.enable_vae_slicing()
|
| 199 |
-
#
|
| 200 |
-
# return pipeline, device
|
| 201 |
-
|
| 202 |
def load_models(cfg):
|
| 203 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 204 |
|
| 205 |
-
|
| 206 |
-
lora_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/RealEstate10K_LoRA.ckpt")
|
| 207 |
-
motion_module_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/v3_sd15_mm.ckpt")
|
| 208 |
-
camera_adaptor_ckpt_path = hf_hub_download("pandaphd/generative_photography", "weights/checkpoint-shutter_speed.ckpt")
|
| 209 |
|
| 210 |
noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
|
| 211 |
-
vae = AutoencoderKL.from_pretrained(pretrained_model_path, subfolder="vae").to(device)
|
| 212 |
vae.requires_grad_(False)
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
text_encoder = CLIPTextModel.from_pretrained(pretrained_model_path, subfolder="text_encoder").to(device)
|
| 216 |
text_encoder.requires_grad_(False)
|
| 217 |
|
| 218 |
unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
|
| 219 |
-
pretrained_model_path,
|
| 220 |
subfolder=cfg.unet_subfolder,
|
| 221 |
unet_additional_kwargs=cfg.unet_additional_kwargs
|
| 222 |
).to(device)
|
| 223 |
unet.requires_grad_(False)
|
| 224 |
|
|
|
|
| 225 |
camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
|
| 226 |
camera_encoder.requires_grad_(False)
|
| 227 |
camera_adaptor = CameraAdaptor(unet, camera_encoder)
|
| 228 |
camera_adaptor.requires_grad_(False)
|
| 229 |
camera_adaptor.to(device)
|
| 230 |
|
|
|
|
| 231 |
unet.set_all_attn_processor(
|
| 232 |
add_spatial_lora=cfg.lora_ckpt is not None,
|
| 233 |
add_motion_lora=cfg.motion_lora_rank > 0,
|
|
@@ -237,25 +149,40 @@ def load_models(cfg):
|
|
| 237 |
)
|
| 238 |
|
| 239 |
if cfg.lora_ckpt is not None:
|
| 240 |
-
|
|
|
|
| 241 |
if 'lora_state_dict' in lora_checkpoints.keys():
|
| 242 |
lora_checkpoints = lora_checkpoints['lora_state_dict']
|
| 243 |
_, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
|
| 244 |
assert len(lora_u) == 0
|
|
|
|
| 245 |
|
| 246 |
if cfg.motion_module_ckpt is not None:
|
| 247 |
-
|
|
|
|
| 248 |
_, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
|
| 249 |
assert len(mm_u) == 0
|
|
|
|
|
|
|
| 250 |
|
|
|
|
| 251 |
if cfg.camera_adaptor_ckpt is not None:
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
| 253 |
camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
|
| 254 |
attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
|
|
|
|
| 255 |
camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
|
|
|
|
| 256 |
assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
|
| 257 |
_, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
|
| 258 |
assert len(attention_processor_u) == 0
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
|
| 260 |
pipeline = GenPhotoPipeline(
|
| 261 |
vae=vae,
|
|
@@ -265,10 +192,9 @@ def load_models(cfg):
|
|
| 265 |
scheduler=noise_scheduler,
|
| 266 |
camera_encoder=camera_encoder
|
| 267 |
).to(device)
|
| 268 |
-
|
| 269 |
pipeline.enable_vae_slicing()
|
| 270 |
-
return pipeline, device
|
| 271 |
|
|
|
|
| 272 |
|
| 273 |
|
| 274 |
def run_inference(pipeline, tokenizer, text_encoder, base_scene, shutter_speed_list, device, video_length=5, height=256, width=384):
|
|
|
|
| 22 |
logging.basicConfig(level=logging.INFO)
|
| 23 |
logger = logging.getLogger(__name__)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
def create_shutter_speed_embedding(shutter_speed_values, target_height, target_width, base_exposure=0.5):
|
| 26 |
"""
|
| 27 |
Create a shutter_speed (Exposure Value or shutter speed) embedding tensor using a constant fwc value.
|
|
|
|
| 114 |
return camera_embedding
|
| 115 |
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
def load_models(cfg):
|
|
|
|
| 118 |
|
| 119 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
noise_scheduler = DDIMScheduler(**OmegaConf.to_container(cfg.noise_scheduler_kwargs))
|
| 122 |
+
vae = AutoencoderKL.from_pretrained(cfg.pretrained_model_path, subfolder="vae").to(device)
|
| 123 |
vae.requires_grad_(False)
|
| 124 |
+
tokenizer = CLIPTokenizer.from_pretrained(cfg.pretrained_model_path, subfolder="tokenizer")
|
| 125 |
+
text_encoder = CLIPTextModel.from_pretrained(cfg.pretrained_model_path, subfolder="text_encoder").to(device)
|
|
|
|
| 126 |
text_encoder.requires_grad_(False)
|
| 127 |
|
| 128 |
unet = UNet3DConditionModelCameraCond.from_pretrained_2d(
|
| 129 |
+
cfg.pretrained_model_path,
|
| 130 |
subfolder=cfg.unet_subfolder,
|
| 131 |
unet_additional_kwargs=cfg.unet_additional_kwargs
|
| 132 |
).to(device)
|
| 133 |
unet.requires_grad_(False)
|
| 134 |
|
| 135 |
+
|
| 136 |
camera_encoder = CameraCameraEncoder(**cfg.camera_encoder_kwargs).to(device)
|
| 137 |
camera_encoder.requires_grad_(False)
|
| 138 |
camera_adaptor = CameraAdaptor(unet, camera_encoder)
|
| 139 |
camera_adaptor.requires_grad_(False)
|
| 140 |
camera_adaptor.to(device)
|
| 141 |
|
| 142 |
+
logger.info("Setting the attention processors")
|
| 143 |
unet.set_all_attn_processor(
|
| 144 |
add_spatial_lora=cfg.lora_ckpt is not None,
|
| 145 |
add_motion_lora=cfg.motion_lora_rank > 0,
|
|
|
|
| 149 |
)
|
| 150 |
|
| 151 |
if cfg.lora_ckpt is not None:
|
| 152 |
+
print(f"Loading the lora checkpoint from {cfg.lora_ckpt}")
|
| 153 |
+
lora_checkpoints = torch.load(cfg.lora_ckpt, map_location=unet.device)
|
| 154 |
if 'lora_state_dict' in lora_checkpoints.keys():
|
| 155 |
lora_checkpoints = lora_checkpoints['lora_state_dict']
|
| 156 |
_, lora_u = unet.load_state_dict(lora_checkpoints, strict=False)
|
| 157 |
assert len(lora_u) == 0
|
| 158 |
+
print(f'Loading done')
|
| 159 |
|
| 160 |
if cfg.motion_module_ckpt is not None:
|
| 161 |
+
print(f"Loading the motion module checkpoint from {cfg.motion_module_ckpt}")
|
| 162 |
+
mm_checkpoints = torch.load(cfg.motion_module_ckpt, map_location=unet.device)
|
| 163 |
_, mm_u = unet.load_state_dict(mm_checkpoints, strict=False)
|
| 164 |
assert len(mm_u) == 0
|
| 165 |
+
print("Loading done")
|
| 166 |
+
|
| 167 |
|
| 168 |
+
# 🔥 Load the Camera Adaptor checkpoint
|
| 169 |
if cfg.camera_adaptor_ckpt is not None:
|
| 170 |
+
logger.info(f"Loading camera adaptor from {cfg.camera_adaptor_ckpt}")
|
| 171 |
+
camera_adaptor_checkpoint = torch.load(cfg.camera_adaptor_ckpt, map_location=device)
|
| 172 |
+
|
| 173 |
+
# Load the Camera Encoder
|
| 174 |
camera_encoder_state_dict = camera_adaptor_checkpoint['camera_encoder_state_dict']
|
| 175 |
attention_processor_state_dict = camera_adaptor_checkpoint['attention_processor_state_dict']
|
| 176 |
+
|
| 177 |
camera_enc_m, camera_enc_u = camera_adaptor.camera_encoder.load_state_dict(camera_encoder_state_dict, strict=False)
|
| 178 |
+
|
| 179 |
assert len(camera_enc_m) == 0 and len(camera_enc_u) == 0
|
| 180 |
_, attention_processor_u = camera_adaptor.unet.load_state_dict(attention_processor_state_dict, strict=False)
|
| 181 |
assert len(attention_processor_u) == 0
|
| 182 |
+
|
| 183 |
+
logger.info("Camera Adaptor loading done")
|
| 184 |
+
else:
|
| 185 |
+
logger.info("No Camera Adaptor checkpoint used")
|
| 186 |
|
| 187 |
pipeline = GenPhotoPipeline(
|
| 188 |
vae=vae,
|
|
|
|
| 192 |
scheduler=noise_scheduler,
|
| 193 |
camera_encoder=camera_encoder
|
| 194 |
).to(device)
|
|
|
|
| 195 |
pipeline.enable_vae_slicing()
|
|
|
|
| 196 |
|
| 197 |
+
return pipeline, device
|
| 198 |
|
| 199 |
|
| 200 |
def run_inference(pipeline, tokenizer, text_encoder, base_scene, shutter_speed_list, device, video_length=5, height=256, width=384):
|
requirements.txt
CHANGED
|
@@ -2,18 +2,18 @@
|
|
| 2 |
torch==2.1.1
|
| 3 |
torchvision==0.16.1
|
| 4 |
torchaudio==2.1.1
|
| 5 |
-
diffusers
|
| 6 |
imageio==2.36.0
|
| 7 |
imageio-ffmpeg
|
| 8 |
-
transformers
|
| 9 |
-
accelerate
|
| 10 |
opencv-python
|
| 11 |
gdown
|
| 12 |
einops
|
| 13 |
decord
|
| 14 |
omegaconf
|
| 15 |
safetensors
|
| 16 |
-
gradio
|
| 17 |
wandb
|
| 18 |
triton
|
| 19 |
-
huggingface_hub
|
|
|
|
| 2 |
torch==2.1.1
|
| 3 |
torchvision==0.16.1
|
| 4 |
torchaudio==2.1.1
|
| 5 |
+
diffusers==0.24.0
|
| 6 |
imageio==2.36.0
|
| 7 |
imageio-ffmpeg
|
| 8 |
+
transformers==4.45.2
|
| 9 |
+
accelerate==1.0.1
|
| 10 |
opencv-python
|
| 11 |
gdown
|
| 12 |
einops
|
| 13 |
decord
|
| 14 |
omegaconf
|
| 15 |
safetensors
|
| 16 |
+
gradio==5.1.0
|
| 17 |
wandb
|
| 18 |
triton
|
| 19 |
+
huggingface_hub==0.25.2
|