Muhammad Taqi Raza committed
Commit ba201a1 · 1 Parent(s): 15db18d

adding options

gradio_app.py CHANGED
@@ -1,4 +1,3 @@
-
  import os
  import subprocess
  from datetime import datetime
@@ -6,165 +5,230 @@ from pathlib import Path
  import gradio as gr
  import numpy as np

- # -----------------------------
- # Setup paths and env
- # -----------------------------
- HF_HOME = "/app/hf_cache"
- os.environ["HF_HOME"] = HF_HOME
- os.environ["TRANSFORMERS_CACHE"] = HF_HOME
- os.makedirs(HF_HOME, exist_ok=True)
+ # # -----------------------------
+ # # Setup paths and env
+ # # -----------------------------
+ # HF_HOME = "/app/hf_cache"
+ # os.environ["HF_HOME"] = HF_HOME
+ # os.environ["TRANSFORMERS_CACHE"] = HF_HOME
+ # os.makedirs(HF_HOME, exist_ok=True)
+
+ # PRETRAINED_DIR = "/app/pretrained"
+ # os.makedirs(PRETRAINED_DIR, exist_ok=True)
+
+ # # -----------------------------
+ # # Step 1: Optional Model Download
+ # # -----------------------------
+ # def download_models():
+ #     expected_model = os.path.join(PRETRAINED_DIR, "RAFT/raft-things.pth")
+ #     if not Path(expected_model).exists():
+ #         print("⚙️ Downloading pretrained models...")
+ #         try:
+ #             subprocess.check_call(["bash", "download/download_models.sh"])
+ #             print("✅ Models downloaded.")
+ #         except subprocess.CalledProcessError as e:
+ #             print(f"❌ Model download failed: {e}")
+ #     else:
+ #         print("✅ Pretrained models already exist.")

- PRETRAINED_DIR = "/app/pretrained"
- os.makedirs(PRETRAINED_DIR, exist_ok=True)
-
- # -----------------------------
- # Step 1: Optional Model Download
- # -----------------------------
- def download_models():
-     expected_model = os.path.join(PRETRAINED_DIR, "RAFT/raft-things.pth")
-     if not Path(expected_model).exists():
-         print("⚙️ Downloading pretrained models...")
-         try:
-             subprocess.check_call(["bash", "download/download_models.sh"])
-             print("✅ Models downloaded.")
-         except subprocess.CalledProcessError as e:
-             print(f"❌ Model download failed: {e}")
-     else:
-         print("✅ Pretrained models already exist.")
-
- download_models()

  # -----------------------------
- # Step 2: Inference Logic
+ # Step 1: Get Anchor Video
  # -----------------------------
- def estimate_near_far(depths, lower_percentile=5, upper_percentile=95):
-     flat = depths.flatten()
-     near = np.percentile(flat, lower_percentile)
-     far = np.percentile(flat, upper_percentile)
-     return near, far
+ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
+                      radius_scale, near_far_estimated,
+                      sampler_name, diffusion_guidance_scale, diffusion_inference_steps,
+                      prompt, negative_prompt, refine_prompt,
+                      depth_inference_steps, depth_guidance_scale,
+                      window_size, overlap, max_res, sample_size, seed_input, height, width):

- def run_epic_inference(video_path, fps, num_frames, target_pose, mode):
      temp_input_path = "/app/temp_input.mp4"
      output_dir = "/app/output_anchor"
      video_output_path = f"{output_dir}/masked_videos/output.mp4"

-     # Save uploaded video
      if video_path:
          os.system(f"cp '{video_path}' {temp_input_path}")

      try:
          theta, phi, r, x, y = target_pose.strip().split()
      except ValueError:
-         return f"Invalid target pose format. Use: θ φ r x y", None
+         return f"Invalid target pose format. Use: θ φ r x y", None, None
      logs = f"Running inference with target pose: θ={theta}, φ={phi}, r={r}, x={x}, y={y}\n"
+
+     # INTEGRATE HEIGHT AND WIDTH PARAMETERS
      command = [
          "python", "/app/inference/v2v_data/inference.py",
          "--video_path", temp_input_path,
          "--stride", "1",
          "--out_dir", output_dir,
-         "--radius_scale", "1",
+         "--radius_scale", str(radius_scale),
          "--camera", "target",
          "--mask",
          "--target_pose", theta, phi, r, x, y,
          "--video_length", str(num_frames),
          "--save_name", "output",
          "--mode", mode,
-         "--fps", str(fps)
-     ]
+         "--fps", str(fps),
+         "--depth_inference_steps", str(depth_inference_steps),
+         "--depth_guidance_scale", str(depth_guidance_scale),
+         "--near_far_estimated", near_far_estimated,
+         "--sampler_name", sampler_name,
+         "--diffusion_guidance_scale", str(diffusion_guidance_scale),
+         "--diffusion_inference_steps", str(diffusion_inference_steps),
+         "--prompt", prompt if prompt else "",
+         "--negative_prompt", negative_prompt,
+         "--refine_prompt", refine_prompt,
+         "--window_size", str(window_size),
+         "--overlap", str(overlap),
+         "--max_res", str(max_res),
+         "--sample_size", sample_size if sample_size else "384, 672",
+         "--seed", seed_input,
+         "--height", str(height),  # Fixed height
+         "--width", str(width)
+     ]
+
      try:
          result = subprocess.run(command, capture_output=True, text=True, check=True)
          logs += result.stdout
      except subprocess.CalledProcessError as e:
-         logs += f"❌ Inference failed:\n{e.stderr}{e.stdout}"
-         return logs, None
-
-     return logs + result.stdout, str(video_output_path) if os.path.exists(video_output_path) else (logs, None)
-
- def print_output_directory(out_dir):
-     result = ""
-     for root, dirs, files in os.walk(out_dir):
-         level = root.replace(out_dir, '').count(os.sep)
-         indent = ' ' * 4 * level
-         result += f"{indent}{os.path.basename(root)}/\n"
-         sub_indent = ' ' * 4 * (level + 1)
-         for f in files:
-             result += f"{sub_indent}{f}\n"
-     return result
-
- def inference(video_path, num_frames, fps, target_pose, mode):
-     logs, video_masked = run_epic_inference(video_path, fps, num_frames, target_pose, mode)
-     # return logs, video_masked, video_masked
-     result_dir = print_output_directory("/app/output_anchor")
-
+         logs += f"❌ Inference failed:\n{e.stderr}{e.stdout}"
+         return None, logs

+     return str(video_output_path), logs
+ # -----------------------------
+ # Step 2: Run Inference
+ # -----------------------------
+ def inference(
+     fps, num_frames, controlnet_weights, controlnet_guidance_start,
+     controlnet_guidance_end, guidance_scale, num_inference_steps, dtype,
+     seed, height, width, downscale_coef, vae_channels,
+     controlnet_input_channels, controlnet_transformer_num_layers
+ ):
      MODEL_PATH = "/app/pretrained/CogVideoX-5b-I2V"
-     ckpt_steps = 500
-     ckpt_dir = "/app/out/EPiC_pretrained"
-     ckpt_file = f"checkpoint-{ckpt_steps}.pt"
-     ckpt_path = f"{ckpt_dir}/{ckpt_file}"
-
+     ckpt_path = "/app/out/EPiC_pretrained/checkpoint-500.pt"
      video_root_dir = "/app/output_anchor"
      out_dir = "/app/output"

-
      command = [
          "python", "/app/inference/cli_demo_camera_i2v_pcd.py",
          "--video_root_dir", video_root_dir,
          "--base_model_path", MODEL_PATH,
          "--controlnet_model_path", ckpt_path,
          "--output_path", out_dir,
-         "--start_camera_idx", "0",
-         "--end_camera_idx", "8",
-         "--controlnet_weights", "1.0",
-         "--controlnet_guidance_start", "0.0",
-         "--controlnet_guidance_end", "0.4",
-         "--controlnet_input_channels", "3",
-         "--controlnet_transformer_num_attn_heads", "4",
-         "--controlnet_transformer_attention_head_dim", "64",
-         "--controlnet_transformer_out_proj_dim_factor", "64",
-         "--controlnet_transformer_out_proj_dim_zero_init",
-         "--vae_channels", "16",
+         "--controlnet_weights", str(controlnet_weights),
+         "--controlnet_guidance_start", str(controlnet_guidance_start),
+         "--controlnet_guidance_end", str(controlnet_guidance_end),
+         "--guidance_scale", str(guidance_scale),
+         "--num_inference_steps", str(num_inference_steps),
+         "--dtype", dtype,
+         "--seed", str(seed),
+         "--height", str(height),
+         "--width", str(width),
          "--num_frames", str(num_frames),
-         "--controlnet_transformer_num_layers", "8",
-         "--infer_with_mask",
-         "--pool_style", "max",
-         "--seed", "43",
-         "--fps", str(fps)
+         "--fps", str(fps),
+         "--downscale_coef", str(downscale_coef),
+         "--vae_channels", str(vae_channels),
+         "--controlnet_input_channels", str(controlnet_input_channels),
+         "--controlnet_transformer_num_layers", str(controlnet_transformer_num_layers)
      ]

      result = subprocess.run(command, capture_output=True, text=True)
-     logs += "\n" + result.stdout
-     result_dir = print_output_directory(out_dir)
-     if result.returncode == 0:
-         logs += "Inference completed successfully."
-     else:
-         logs += f"Error occurred during inference: {result.stderr}"
+     logs = result.stdout
+     video_output = f"{out_dir}/00000_{seed}_out.mp4"
+     return video_output if os.path.exists(video_output) else None, logs

-     return logs + result_dir + "Hello! it is successful", str(f"{out_dir}/00000_43_out.mp4"), video_masked

  # -----------------------------
- # Step 3: Create Gradio UI
+ # UI
  # -----------------------------
- demo = gr.Interface(
-     fn=inference,
-     inputs=[
-         gr.Video(label="Upload Video (MP4)"),
-         gr.Slider(minimum=1, maximum=120, value=50, step=1, label="Number of Frames"),
-         gr.Slider(minimum=1, maximum=90, value=10, step=1, label="FPS"),
-         gr.Textbox(label="Target Pose φ r x y)", placeholder="e.g., 0 30 -0.6 0 0"),
-         gr.Dropdown(choices=["gradual", "direct", "bullet"], value="gradual", label="Camera Mode"),
-     ],
-     outputs=[
-         gr.Textbox(label="Inference Logs"),
-         gr.Video(label="Generated Video`"),
-         gr.Video(label="Masked Video")
-     ],
-     title="🎬 EPiC: Efficient Video Camera Control",
-     description="Upload a video, describe the scene, and apply cinematic camera motion using pretrained EPiC models.",
- )
+ demo = gr.Blocks()
+
+ with demo:
+     gr.Markdown("## 🎬 EPiC: Cinematic Camera Control")
+
+     with gr.Tabs():
+         with gr.TabItem("Step 1: Camera Anchor"):
+             with gr.Row():
+                 with gr.Column():
+                     with gr.Row():
+                         near_far_estimated = gr.Checkbox(label="Near Far Estimation", value=True)  # integrate it with
+                         pose_input = gr.Textbox(label="Target Pose (θ φ r x y)", placeholder="e.g., 0 30 -0.6 0 0")
+                         fps_input = gr.Number(value=24, label="FPS")
+                         num_frames_input = gr.Number(value=49, label="Number of Frames")
+                         radius_input = gr.Number(value=1.0, label="Radius Scale")
+                         mode_input = gr.Dropdown(choices=["gradual", "direct", "bullet"], value="gradual", label="Camera Mode")
+                         sampler_input = gr.Dropdown(choices=["Euler", "Euler A", "DPM++", "PNDM", "DDIM_Cog", "DDIM_Origin"], value="DDIM_Origin", label="Sampler")
+                         diff_guidance_input = gr.Number(value=6.0, label="Diffusion Guidance")
+                         diff_steps_input = gr.Number(value=50, label="Diffusion Steps")
+                         depth_steps_input = gr.Number(value=5, label="Depth Steps")
+                         depth_guidance_input = gr.Number(value=1.0, label="Depth Guidance")
+                         window_input = gr.Number(value=64, label="Window Size")
+                         overlap_input = gr.Number(value=25, label="Overlap")
+                         maxres_input = gr.Number(value=1024, label="Max Resolution")
+                         sample_size = gr.Textbox(label="Sample Size (height, width)", placeholder="e.g., 384, 672", value="384, 672")
+                         seed_input = gr.Number(value=43, label="Seed")
+                         height = gr.Number(value=576, label="Height")
+                         width = gr.Number(value=1024, label="Width")
+                         prompt_input = gr.Textbox(label="Prompt")
+                         neg_prompt_input = gr.Textbox(label="Negative Prompt", value="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory.")
+                         refine_prompt_input = gr.Textbox(label="Refine Prompt", value=" The video is of high quality, and the view is very clear. ")
+                 with gr.Column():
+                     video_input = gr.Video(label="Upload Video (MP4)")
+                     step1_button = gr.Button("▶️ Run Step 1")
+                     step1_video = gr.Video(label="[Step 1] Masked Video")
+                     step1_logs = gr.Textbox(label="[Step 1] Logs")
+
+         with gr.TabItem("Step 2: CogVideoX Refinement"):
+             with gr.Row():
+                 with gr.Column():
+                     with gr.Row():
+                         controlnet_weights_input = gr.Number(value=0.5, label="ControlNet Weights")
+                         controlnet_guidance_start_input = gr.Number(value=0.0, label="Guidance Start")
+                         controlnet_guidance_end_input = gr.Number(value=0.5, label="Guidance End")
+                         guidance_scale_input = gr.Number(value=6.0, label="Guidance Scale")
+                         inference_steps_input = gr.Number(value=50, label="Num Inference Steps")
+                         dtype_input = gr.Dropdown(choices=["float16", "bfloat16"], value="bfloat16", label="Compute Dtype")
+                         seed_input2 = gr.Number(value=42, label="Seed")
+                         height_input = gr.Number(value=480, label="Height")
+                         width_input = gr.Number(value=720, label="Width")
+                         num_frames_input2 = gr.Number(value=97, label="Num Frames")
+                         fps_input2 = gr.Number(value=8, label="FPS")
+                         downscale_coef_input = gr.Number(value=8, label="Downscale Coef")
+                         vae_channels_input = gr.Number(value=16, label="VAE Channels")
+                         controlnet_input_channels_input = gr.Number(value=6, label="ControlNet Input Channels")
+                         controlnet_layers_input = gr.Number(value=8, label="ControlNet Transformer Layers")
+                 with gr.Column():
+                     step2_video = gr.Video(label="[Step 2] Final Refined Video")
+                     step2_button = gr.Button("▶️ Run Step 2")
+                     step2_logs = gr.Textbox(label="[Step 2] Logs")
+
+
+     step1_button.click(
+         get_anchor_video,
+         inputs=[
+             video_input, fps_input, num_frames_input, pose_input, mode_input,
+             radius_input, near_far_estimated,
+             sampler_input, diff_guidance_input, diff_steps_input,
+             prompt_input, neg_prompt_input, refine_prompt_input,
+             depth_steps_input, depth_guidance_input,
+             window_input, overlap_input, maxres_input, sample_size, seed_input, height, width
+         ],
+         outputs=[step1_video, step1_logs]
+     )
+     step2_button.click(
+         inference,
+         inputs=[
+             fps_input2, num_frames_input2,
+             controlnet_weights_input, controlnet_guidance_start_input,
+             controlnet_guidance_end_input, guidance_scale_input,
+             inference_steps_input, dtype_input, seed_input2,
+             height_input, width_input, downscale_coef_input,
+             vae_channels_input, controlnet_input_channels_input,
+             controlnet_layers_input
+         ],
+         outputs=[step2_video, step2_logs]
+     )

- # -----------------------------
- # Step 4: Launch App
- # -----------------------------
  if __name__ == "__main__":
-     demo.launch(server_name="0.0.0.0", server_port=7860)
+     # download_models()
+     demo.launch(server_name="0.0.0.0", server_port=7860)
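
Note on the Step 1 command assembled in get_anchor_video above: subprocess.run() only accepts strings (or bytes/os.PathLike) in its argument list, while gr.Number yields a float by default and gr.Checkbox yields a bool, so values such as seed_input and near_far_estimated would need converting before being passed. A minimal sketch of coercing the whole list in one place; the helper name build_anchor_command and the reduced flag set are illustrative, not part of this commit:

import subprocess

def build_anchor_command(video_path, out_dir, num_frames, fps, radius_scale,
                         seed_input, near_far_estimated, height, width):
    # Subset of the flags used above; every value is converted to str once, at the end.
    args = [
        "python", "/app/inference/v2v_data/inference.py",
        "--video_path", video_path,
        "--out_dir", out_dir,
        "--video_length", int(num_frames),
        "--fps", int(fps),
        "--radius_scale", radius_scale,
        "--seed", int(seed_input),                          # gr.Number gives a float by default
        "--near_far_estimated", bool(near_far_estimated),   # gr.Checkbox gives a bool
        "--height", int(height),
        "--width", int(width),
    ]
    return [str(a) for a in args]

# result = subprocess.run(build_anchor_command(...), capture_output=True, text=True)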
inference/v2v_data/demo.py CHANGED
@@ -24,8 +24,8 @@ def get_center_crop_resolution(original_resoultion, target_aspect_ratio=(2, 3)):
      crop_w = original_w
      crop_h = int(crop_w / aspect_ratio)

-     resized_h = 576
-     resized_w = 1024
+     resized_h = original_resoultion[0]  # previous 576
+     resized_w = original_resoultion[1]  # previous 1024

      h_ratio = resized_h / original_h
      w_ratio = resized_w / original_w
@@ -111,7 +111,7 @@ class GetAnchorVideos:

      def infer_gradual(self, opts):
          frames = read_video_frames(
-             opts.video_path, opts.video_length, opts.stride, opts.max_res
+             opts.video_path, opts.video_length, opts.stride, opts.max_res, opts.height, opts.width
          )
          vr = VideoReader(opts.video_path, ctx=cpu(0))
          frame_shape = vr[0].shape  # (H, W, 3)
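
With this change the resize target in get_center_crop_resolution follows the input resolution instead of the fixed 1024x576, so the two ratios computed right after it collapse to 1.0 and frames keep their original size. Illustrative values only, not from the commit:

original_resoultion = (720, 1280)              # (H, W), example input
resized_h = original_resoultion[0]             # previously 576
resized_w = original_resoultion[1]             # previously 1024
h_ratio = resized_h / original_resoultion[0]   # 1.0
w_ratio = resized_w / original_resoultion[1]   # 1.0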
inference/v2v_data/inference.py CHANGED
@@ -32,7 +32,7 @@ def get_parser():
          '--seed', type=int, default=43, help='Random seed for reproducibility'
      )
      parser.add_argument(
-         '--video_length', type=int, default=97, help='Length of the video frames'
+         '--video_length', type=int, default=49, help='Length of the video frames'
      )
      parser.add_argument('--fps', type=int, default=10, help='Fps for saved video')
      parser.add_argument(
@@ -48,6 +48,7 @@ def get_parser():
          help='Scale factor for the spherical radius',
      )
      parser.add_argument('--camera', type=str, default='traj', help='traj or target')
+
      parser.add_argument(
          '--mode', type=str, default='gradual', help='gradual, bullet or direct'
      )
@@ -71,8 +72,21 @@ def get_parser():
      parser.add_argument(
          '--far', type=float, default=10000.0, help='Far clipping plane distance'
      )
+     parser.add_argument(
+         '--height', type=int, default=1080, help='Height'
+     )
+     parser.add_argument(
+         '--width', type=int, default=1920, help='width'
+     )
+
+
      parser.add_argument('--anchor_idx', type=int, default=0, help='One GT frame')
-
+     parser.add_argument(
+         '--near_far_estimated',
+         type=bool,
+         default=True,
+         help='Use estimated near and far values',
+     )
      ## diffusion
      parser.add_argument(
          '--low_gpu_memory_mode',
@@ -80,6 +94,7 @@ def get_parser():
          default=False,
          help='Enable low GPU memory mode',
      )
+
      # parser.add_argument('--model_name', type=str, default='checkpoints/CogVideoX-Fun-V1.1-5b-InP', help='Path to the model')
      parser.add_argument(
          '--model_name',
@@ -126,13 +141,13 @@ def get_parser():
      parser.add_argument(
          '--negative_prompt',
          type=str,
-         default="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion.",
+         default="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid.",
          help='Negative prompt for video generation',
      )
      parser.add_argument(
          '--refine_prompt',
          type=str,
-         default=". The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.",
+         default=". The video is of high quality, and the view is very clear. ",
          help='Prompt for video generation',
      )
      parser.add_argument('--qwen_path', type=str, default="/app/pretrained/Qwen2.5-VL-7B-Instruct")
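
One caveat with the new --near_far_estimated option: argparse's type=bool simply calls bool() on the raw string, and any non-empty string (including "False") is truthy, so the flag cannot actually be switched off from the command line. A small string-to-bool converter, not part of this commit, is the usual workaround:

import argparse

def str2bool(v):
    # Accept common spellings; reject anything else.
    if isinstance(v, bool):
        return v
    if v.lower() in ("true", "1", "yes"):
        return True
    if v.lower() in ("false", "0", "no"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {v!r}")

parser = argparse.ArgumentParser()
parser.add_argument('--near_far_estimated', type=str2bool, default=True)
print(parser.parse_args(['--near_far_estimated', 'False']).near_far_estimated)  # False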
inference/v2v_data/models/utils.py CHANGED
@@ -28,7 +28,7 @@ from decord import VideoReader, cpu

  from PIL import Image

- def read_video_frames(video_path, process_length, stride, max_res, dataset="open"):
+ def read_video_frames(video_path, process_length, stride, max_res, dataset="open", height=576, width=1024):
      def is_image(path):
          return any(path.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.bmp'])

@@ -36,8 +36,8 @@ def read_video_frames(video_path, process_length, stride, max_res, dataset="open
          print("==> Detected image. Loading as single-frame video:", video_path)
          img = Image.open(video_path).convert("RGB")
          # FIXME: hard coded
-         width = 1024
-         height = 576
+         width = width
+         height = height
          img = img.resize((width, height), Image.BICUBIC)
          img = np.array(img).astype("float32") / 255.0  # [H, W, 3]
          frames = img[None, ...]  # [1, H, W, 3]
@@ -50,8 +50,8 @@ def read_video_frames(video_path, process_length, stride, max_res, dataset="open
      print("==> original video shape:", (len(vid), *vid.get_batch([0]).shape[1:]))

      # FIXME: hard coded
-     width = 1024
-     height = 576
+     width = width
+     height = height

      vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)
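
Because the new height and width parameters sit after dataset in the read_video_frames signature, a purely positional call like the one added in demo.py binds opts.height to the dataset slot and opts.width to height. Passing them as keywords avoids the mismatch; this is an illustrative alternative, not what the commit does:

frames = read_video_frames(
    opts.video_path, opts.video_length, opts.stride, opts.max_res,
    height=opts.height, width=opts.width,  # keywords skip the dataset parameter
)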