HongcanGuo committed (verified)
Commit 7e348d8 · Parent(s): 90f84f0

Update app.py

Files changed (1):
  1. app.py (+46 -51)
app.py CHANGED
@@ -4,13 +4,10 @@ from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration
 import torch
 from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
-from diffusers.utils import export_to_gif
 from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips
 from transformers import AutoProcessor, MusicgenForConditionalGeneration
 import scipy.io.wavfile
 import re
-import glob
-import os
 from io import BytesIO
 
 # Define the image-to-text function
@@ -63,83 +60,82 @@ def text2text(user_input):
     completion = response.json()
     return completion['choices'][0]['message']['content']
 
+# Define the text-to-video function
 def text2vid(input_text):
     sentences = re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', input_text)
-    adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM", config_file="wangfuyun/AnimateLCM/config.json", torch_dtype=torch.float16)
-    pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
-    pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
-    pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
-    try:
-        pipe.set_adapters(["lcm-lora"], [0.8])
-    except ValueError as e:
-        print("Ignoring the error:", str(e))
-    pipe.enable_vae_slicing()
-    pipe.enable_model_cpu_offload()
+    adapter = MotionAdapter.from_pretrained("your-motion-adapter")
+    pipe = AnimateDiffPipeline.from_pretrained("your-diffusion-model", motion_adapter=adapter)
     video_clips = []
     for sentence in sentences:
-        output = pipe(
-            prompt=sentence + ", 4k, high resolution",
-            negative_prompt="bad quality, worse quality, low resolution",
-            num_frames=24,
-            guidance_scale=2.0,
-            num_inference_steps=6,
-            generator=torch.Generator("cpu").manual_seed(0)
-        )
-        frames = output.frames[0]
-        video_clip = frames_to_video_clip(frames)
+        frames = pipe(sentence, num_inference_steps=50, guidance_scale=7.5)
+        video_clip = frames_to_video_clip(frames)  # Assume this function converts frames to a video clip
         video_clips.append(video_clip)
     final_clip = concatenate_videoclips(video_clips, method="compose")
     return final_clip
 
-# Define the function that generates the final video
-def video_generate():
-    frame_rate = 24
-    gif_files = sorted(glob.glob('./*.gif'))
-    clips = [VideoFileClip(gif) for gif in gif_files]
-    final_clip = concatenate_videoclips(clips, method="compose")
-    final_clip.write_videofile('output_video.mp4', codec='libx264')
+def text2text_A(user_input):
+    # Set the API key and base URL
+    api_key = "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"
+    base_url = "https://openrouter.ai/api/v1"
+
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json"
+    }
+
+    data = {
+        "model": "openai/gpt-3.5-turbo",
+        "messages": [
+            {
+                "role": "system",
+                "content": (
+                    "You are an expert in music criticism, please match this story with a suitable musical style based on my input and describe it, please make sure you follow my format output and do not add any other statements e.g. Input: in a small tavern everyone danced, the bartender poured drinks for everyone, everyone had a good time and was very happy and sang and danced. Output: 80s pop track with bassy drums and synth."
+                    "Again, please make sure you follow the format of the output, here is my input:"
+
+                )
+            },
+            { "role": "user", "content": user_input }
+        ]
+    }
+
+    response = requests.post(f"{base_url}/chat/completions", headers=headers, json=data)
+    response.raise_for_status()  # Ensure the request succeeded
+
+    completion = response.json()
+    print(completion['choices'][0]['message']['content'])
+    return completion['choices'][0]['message']['content']
 
-# Modified audio generation function
+# Define the text-to-audio function
 def text2audio(text_input, duration_seconds):
     processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
     model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
     inputs = processor(text=[text_input], padding=True, return_tensors="pt")
     max_new_tokens = int((duration_seconds / 5) * 256)
     audio_values = model.generate(**inputs, max_new_tokens=max_new_tokens)
-    audio_array = audio_values[0, 0].numpy()
-    audio_clip = numpy_array_to_audio_clip(audio_array, rate=model.config.audio_encoder.sampling_rate)
+    audio_clip = numpy_array_to_audio_clip(audio_values.numpy(), rate=model.config.audio_encoder.sampling_rate)  # Assume this function converts a numpy array to an audio clip
     return audio_clip
 
-# Modified final-video generation function
+# Define the function that generates the result video
 def result_generate(video_clip, audio_clip):
     video = video_clip.set_audio(audio_clip)
-    video_bytes = video_clip_to_bytes(video)
-    return video_bytes
-
+    video_buffer = BytesIO()
+    video.write_videofile(video_buffer, codec="libx264", audio_codec="aac")
+    video_buffer.seek(0)
+    return video_buffer
 
-# Main function, combining the modifications above
+# Integrate all the steps into the main function
 def generate_video(image):
     text = img2text(image)
     sentences = text2text(text)
     final_video_clip = text2vid(sentences)
-    video = VideoFileClip(final_video_clip)
+    video = VideoFileClip(final_video_clip)  # Assumes final_video_clip is a path or BytesIO object
     duration = video.duration
-    audio_text = text2text(text)
+    audio_text = text2text_A(text)
     audio_clip = text2audio(audio_text, duration)
     result_video = result_generate(final_video_clip, audio_clip)
     return result_video
 
 # Define the Gradio interface
-# interface = gr.Interface(
-#     fn=generate_video,
-#     inputs=gr.Image(type="pil"),
-#     outputs=gr.Video(),
-#     title="InspiroV Video Generation",
-#     description="Upload an image to generate a video using a custom model",
-#     theme="soft"
-# )
-
-
 interface = gr.Interface(
     fn=lambda img: generate_video(img),
     inputs=gr.Image(type="pil"),
@@ -149,6 +145,5 @@ interface = gr.Interface(
     theme="soft"
 )
 
-
 # Launch the Gradio app
 interface.launch()
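A note on the new `text2vid`: it assumes the storyboard returned by `text2text` lists one scene per line in the form `[1] ...`, `[2] ...`; otherwise `re.findall` returns an empty list and no clips are generated. A quick check of the regex against a made-up input:

```python
import re

sample = "[1] A fox wanders through a misty forest.\n[2] It stops at a moonlit river."
print(re.findall(r'\[\d+\] (.+?)(?:\n|\Z)', sample))
# ['A fox wanders through a misty forest.', 'It stops at a moonlit river.']
```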
 
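Both the removed and the added loop bodies rely on a `frames_to_video_clip` helper that is not defined anywhere in this diff (the inline comment says to assume it exists). A minimal sketch on top of moviepy's `ImageSequenceClip`, assuming the frames are PIL images and an 8 fps playback rate (both assumptions, not values from the commit):

```python
import numpy as np
from moviepy.editor import ImageSequenceClip

def frames_to_video_clip(frames, fps=8):
    # Assumed helper: turn a list of PIL images into a moviepy video clip.
    # fps=8 is a guess for AnimateDiff-style outputs, not a value from the commit.
    arrays = [np.array(frame) for frame in frames]
    return ImageSequenceClip(arrays, fps=fps)
```

Note that in the new loop `pipe(sentence, ...)` returns a pipeline output object, so the list of images usually lives at `output.frames[0]` (as the removed code accessed it) rather than being the return value itself.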
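`numpy_array_to_audio_clip` is likewise assumed rather than defined. One possible implementation on top of moviepy's `AudioArrayClip`, written for a mono waveform such as the `audio_values[0, 0].numpy()` array the removed line produced (the helper name comes from the diff; the rest is a sketch):

```python
import numpy as np
from moviepy.audio.AudioClip import AudioArrayClip

def numpy_array_to_audio_clip(audio_array, rate):
    # Assumed helper: wrap a 1-D mono waveform as a moviepy audio clip.
    samples = np.asarray(audio_array, dtype=np.float32).reshape(-1, 1)  # (n_samples, 1)
    return AudioArrayClip(samples, fps=rate)
```

The added line passes the full `audio_values.numpy()` tensor (batch x channels x samples), so some indexing or squeezing would still be needed before wrapping it. The `max_new_tokens` formula follows the commonly cited rule of thumb that roughly 256 MusicGen tokens yield about 5 seconds of audio; for a 12-second clip it requests `int((12 / 5) * 256) = 614` tokens.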
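One caveat with the new `result_generate`: moviepy's `write_videofile` expects a file path, so writing straight into a `BytesIO` buffer as the added lines do is likely to fail. A workaround is to render to a temporary file and read the bytes back; the helper name below is hypothetical, not part of the commit:

```python
import tempfile
from io import BytesIO

def clip_to_mp4_bytes(video):
    # Hypothetical helper: render a moviepy clip to MP4 and return it in memory.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        tmp_path = tmp.name
    video.write_videofile(tmp_path, codec="libx264", audio_codec="aac")
    with open(tmp_path, "rb") as f:
        return BytesIO(f.read())
```

Returning the temporary file's path instead of an in-memory buffer may also be simpler here, since `gr.Video` accepts a file path as its output value.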