Update app.py
app.py CHANGED
@@ -4,19 +4,24 @@ from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration
 import torch
 from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
-from moviepy.editor import
+from moviepy.editor import concatenate_videoclips, AudioFileClip
+from moviepy.video.io.ImageSequenceClip import ImageSequenceClip
 from transformers import AutoProcessor, MusicgenForConditionalGeneration
 import scipy.io.wavfile
 import re
-
+import numpy as np
+import os
+import io
+import tempfile
 
 # Define the image-to-text function
 def img2text(image):
-    processor = BlipProcessor.from_pretrained("
-    model = BlipForConditionalGeneration.from_pretrained("
+    processor = BlipProcessor.from_pretrained("blip-image-captioning-large")
+    model = BlipForConditionalGeneration.from_pretrained("blip-image-captioning-large")
     inputs = processor(image, return_tensors="pt")
     out = model.generate(**inputs)
     caption = processor.decode(out[0], skip_special_tokens=True)
+    print(caption)
    return caption
 
 # Define the text generation function
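For reference, the img2text step above can be exercised in isolation. A minimal sketch, assuming the public Hub checkpoint "Salesforce/blip-image-captioning-large" (the commit loads a bare local "blip-image-captioning-large" path instead) and a hypothetical test image:

# Standalone BLIP captioning sketch; the checkpoint ID and image path are assumptions.
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")

image = Image.open("example.jpg").convert("RGB")  # hypothetical test image
inputs = processor(image, return_tensors="pt")
out = model.generate(**inputs)
print(processor.decode(out[0], skip_special_tokens=True))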
@@ -58,20 +63,84 @@ def text2text(user_input):
     response = requests.post(f"{base_url}/chat/completions", headers=headers, json=data)
     response.raise_for_status()
     completion = response.json()
+    print(completion['choices'][0]['message']['content'])
     return completion['choices'][0]['message']['content']
 
+
+
+
+import torch
+from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
+from diffusers.utils import export_to_gif
+import re
+def text2vid(input_text):
+    # Split the input text with a regular expression and extract the sentences
+    sentences = re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', input_text)
+
+    # Load the motion adapter and the AnimateDiff pipeline
+    adapter = MotionAdapter.from_pretrained("/home/u2022211776/jupyterlab/AnimateLCM", config_file="/home/u2022211776/jupyterlab/AnimateLCM/config.json", torch_dtype=torch.float16)
+    pipe = AnimateDiffPipeline.from_pretrained("/home/u2022211776/jupyterlab/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
+    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+
+    # Load the LoRA weights
+    pipe.load_lora_weights("/home/u2022211776/jupyterlab/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
+
+    # Set the adapter and enable optimizations
+    try:
+        pipe.set_adapters(["lcm-lora"], [0.8])
+    except ValueError as e:
+        print("Ignoring the error:", str(e))
+    pipe.enable_vae_slicing()
+    pipe.enable_model_cpu_offload()
+
+    all_frames = []  # store the frames of all sentences
+
+    # Loop over each sentence, generate the animation, and export it as a GIF
+    for index, sentence in enumerate(sentences):
+        output = pipe(
+            #prompt=sentence + ", 4k, high resolution",
+            prompt=sentence + ", cartoon",
+            negative_prompt="bad quality, worse quality, low resolution",
+            num_frames=24,
+            guidance_scale=2.0,
+            num_inference_steps=6,
+            generator=torch.Generator("cpu").manual_seed(0)
+        )
+        frames = output.frames[0]
+        all_frames.extend(frames)  # append each sentence's frames to all_frames
+
+    return all_frames
+
+
+
+
 # Define the text-to-video function
 def text2vid(input_text):
     sentences = re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', input_text)
-    adapter = MotionAdapter.from_pretrained("
-    pipe = AnimateDiffPipeline.from_pretrained("
-
+    adapter = MotionAdapter.from_pretrained("AnimateLCM", config_file="AnimateLCM/config.json", torch_dtype=torch.float16)
+    pipe = AnimateDiffPipeline.from_pretrained("epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
+    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
+    pipe.load_lora_weights("AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
+    try:
+        pipe.set_adapters(["lcm-lora"], [0.8])
+    except ValueError as e:
+        print("Ignoring the error:", str(e))
+    pipe.enable_vae_slicing()
+    pipe.enable_model_cpu_offload()
+
+    video_frames = []
     for sentence in sentences:
-
-
-
-
-
+        output = pipe(
+            prompt=sentence + ", 4k, high resolution",
+            negative_prompt="bad quality, worse quality, low resolution",
+            num_frames=24,
+            guidance_scale=2.0,
+            num_inference_steps=6,
+            generator=torch.Generator("cpu").manual_seed(0)
+        )
+        video_frames.extend(output.frames[0])
+
+    return video_frames
 
 def text2text_A(user_input):
     # Set the API key and base URL
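Note that this revision defines text2vid twice: the absolute-path variant is shadowed at import time by the second, relative-path definition, so only the latter actually runs. The same AnimateLCM setup can be smoke-tested against public Hub checkpoints. A minimal sketch, where the repo IDs wangfuyun/AnimateLCM and emilianJR/epiCRealism are assumptions standing in for the commit's local paths, using the export_to_gif helper the commit imports but never calls:

# AnimateLCM smoke test for a single prompt; checkpoint IDs are assumptions.
import torch
from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
from diffusers.utils import export_to_gif

adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM", torch_dtype=torch.float16)
pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
pipe.set_adapters(["lcm-lora"], [0.8])
pipe.enable_vae_slicing()
pipe.enable_model_cpu_offload()

output = pipe(
    prompt="a sailboat drifting across a misty lake, 4k, high resolution",
    negative_prompt="bad quality, worse quality, low resolution",
    num_frames=24,
    guidance_scale=2.0,
    num_inference_steps=6,
    generator=torch.Generator("cpu").manual_seed(0),
)
export_to_gif(output.frames[0], "animation.gif")  # 24 frames at the pipeline's default size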
@@ -107,13 +176,13 @@ def text2text_A(user_input):
 
 # Define the text-to-audio function
 def text2audio(text_input, duration_seconds):
-    processor = AutoProcessor.from_pretrained("
-    model = MusicgenForConditionalGeneration.from_pretrained("
+    processor = AutoProcessor.from_pretrained("musicgen-small")
+    model = MusicgenForConditionalGeneration.from_pretrained("musicgen-small")
     inputs = processor(text=[text_input], padding=True, return_tensors="pt")
     max_new_tokens = int((duration_seconds / 5) * 256)
     audio_values = model.generate(**inputs, max_new_tokens=max_new_tokens)
-
-    return
+    print(duration_seconds)
+    return audio_values[0, 0].numpy(), model.config.audio_encoder.sampling_rate
 
 # Define the function that generates the result video
 def result_generate(video_clip, audio_clip):
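The max_new_tokens arithmetic assumes MusicGen emits roughly 256 audio tokens per five seconds of sound (the model generates at about 50 audio frames per second), so the requested duration maps linearly onto the token budget. A minimal sketch of the audio step on its own, assuming the public Hub checkpoint "facebook/musicgen-small" in place of the commit's local "musicgen-small" path:

# Standalone MusicGen sketch; the checkpoint ID and prompt are assumptions.
import scipy.io.wavfile
from transformers import AutoProcessor, MusicgenForConditionalGeneration

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

inputs = processor(text=["calm ambient piano"], padding=True, return_tensors="pt")
audio_values = model.generate(**inputs, max_new_tokens=256)  # roughly 5 seconds of audio

rate = model.config.audio_encoder.sampling_rate  # 32 kHz for MusicGen
scipy.io.wavfile.write("clip.wav", rate, audio_values[0, 0].numpy())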
@@ -123,18 +192,50 @@ def result_generate(video_clip, audio_clip):
     video_buffer.seek(0)
     return video_buffer
 
-# Combine all steps into the main function
 def generate_video(image):
+    # Get the image caption
     text = img2text(image)
+    # Generate detailed text scene descriptions
     sentences = text2text(text)
-
-
-
+    # Generate the video frames
+    video_frames = text2vid(sentences)
+
+    # Convert the video frames to numpy arrays
+    video_frames = [np.array(frame) for frame in video_frames]
+
+    # Create the video clip
+    video_clip = ImageSequenceClip(video_frames, fps=24)
+    video_duration = video_clip.duration
+
+    # Generate the audio data
     audio_text = text2text_A(text)
-
-
-
+    audio_data, audio_rate = text2audio(audio_text, video_duration)
+
+    # Write the audio data to a temporary file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
+        scipy.io.wavfile.write(tmpfile, audio_rate, audio_data)
+        tmpfile_path = tmpfile.name
+
+    # Create an AudioFileClip object
+    audio_clip = AudioFileClip(tmpfile_path)
 
+    # Add the audio to the video
+    video_clip = video_clip.set_audio(audio_clip)
+    print("audio_done")
+
+    # Write the video to a temporary file
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmpfile:
+        video_clip.write_videofile(tmpfile.name, codec="libx264", audio_codec="aac")
+        video_file_path = tmpfile.name
+
+    # Read the temporary file's data, then delete it
+    with open(video_file_path, 'rb') as f:
+        video_data = f.read()
+    os.remove(video_file_path)
+    os.remove(tmpfile_path)
+    print("video_done")
+    return video_data
+
 # Define the Gradio interface
 interface = gr.Interface(
     fn=lambda img: generate_video(img),
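The diff view is cut off inside the gr.Interface(...) call, so the interface's inputs and outputs are not recoverable here. The moviepy muxing that generate_video performs (frames into an ImageSequenceClip, a WAV written through tempfile into an AudioFileClip, then an H.264 encode) can still be checked without any of the models. A minimal sketch with synthetic frames and a silent stand-in track; all inputs are hypothetical placeholders:

# moviepy muxing sketch; the frames and audio are synthetic stand-ins.
import os
import tempfile
import numpy as np
import scipy.io.wavfile
from moviepy.editor import AudioFileClip
from moviepy.video.io.ImageSequenceClip import ImageSequenceClip

# 48 synthetic RGB frames at 24 fps -> a 2-second clip
frames = [np.full((64, 64, 3), i * 5, dtype=np.uint8) for i in range(48)]
video_clip = ImageSequenceClip(frames, fps=24)

# 2 seconds of silence at 32 kHz standing in for the MusicGen output
silence = np.zeros(2 * 32000, dtype=np.float32)
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
    scipy.io.wavfile.write(tmpfile, 32000, silence)
    wav_path = tmpfile.name

video_clip = video_clip.set_audio(AudioFileClip(wav_path))
video_clip.write_videofile("out.mp4", codec="libx264", audio_codec="aac")
os.remove(wav_path)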