Ahmadkhan12 committed on
Commit 5f0b0b8 · verified · 1 Parent(s): daffb79

Create app.py

Files changed (1)
  1. app.py +210 -0
app.py ADDED
@@ -0,0 +1,210 @@
+ # Dependencies: app.py is a script, not a notebook, so the original "!pip install"
+ # line (a SyntaxError in plain Python) is replaced by a comment. Install via
+ # requirements.txt (on Spaces) or, for example:
+ #   pip install gradio moviepy torch torchvision torchaudio pillow numpy scipy transformers audiocraft
+ 
+ # Import libraries
+ import os
+ import urllib.request
+ import logging
+ import gradio as gr
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ from PIL import Image
+ from scipy.io.wavfile import write as write_wav
+ from torchvision import models, transforms  # transforms is used below but was never imported
+ from moviepy.editor import VideoFileClip, AudioFileClip  # moviepy 1.x import path
+ # transformers has no AutoModelForAudioGeneration: MusicGen ships as
+ # MusicgenForConditionalGeneration, and AudioGen is loaded via audiocraft.
+ from transformers import AutoProcessor, MusicgenForConditionalGeneration
+ from audiocraft.models import AudioGen
+ 
+ # Set up logging for easier debugging
+ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger()
+ 
+ # Load a Places365 scene-classification model (kept on CPU to save GPU memory).
+ # The ImageNet-pretrained resnet18 from torch.hub used originally has 1000
+ # ImageNet classes that do not match the 365 Places365 labels below, so the
+ # official Places365 checkpoint is loaded instead (URL taken from the Places365
+ # project page; treat it as an assumption if mirrors have moved).
+ try:
+     logging.info("Loading Places365 model for scene detection...")
+     checkpoint_file = "resnet18_places365.pth.tar"
+     if not os.path.exists(checkpoint_file):
+         urllib.request.urlretrieve("http://places2.csail.mit.edu/models_places365/" + checkpoint_file, checkpoint_file)
+     places365 = models.resnet18(num_classes=365)
+     checkpoint = torch.load(checkpoint_file, map_location="cpu")
+     places365.load_state_dict({k.replace("module.", ""): v for k, v in checkpoint["state_dict"].items()})
+     places365.eval()
+     places365.to("cpu")  # keep scene classification on CPU
+     logging.info("Places365 model loaded successfully.")
+ except Exception as e:
+     logging.error(f"Error loading Places365 model: {e}")
+     raise
+ 
+ # Load the Places365 class labels (the original used a notebook-style !wget,
+ # which is not valid Python in a script).
+ labels_file = "categories_places365.txt"
+ if not os.path.exists(labels_file):
+     urllib.request.urlretrieve("http://places2.csail.mit.edu/models_places365/" + labels_file, labels_file)
+ with open(labels_file, "r") as f:
+     # Each line looks like "/a/airfield 0"; strip the leading "/a/" prefix
+     SCENE_CLASSES = [line.strip().split(" ")[0][3:] for line in f]
+ 
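+ # For reference (assuming the standard label file), its first lines look like
+ #   /a/airfield 0
+ #   /a/airplane_cabin 1
+ # so SCENE_CLASSES[0] == "airfield".
+ 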
+ # Load AudioGen Medium (via audiocraft) and MusicGen Medium (via transformers)
+ try:
+     logging.info("Loading AudioGen Medium and MusicGen Medium models...")
+     # audiocraft places the model on the GPU automatically when one is available
+     audiogen_model = AudioGen.get_pretrained("facebook/audiogen-medium")
+     musicgen_processor = AutoProcessor.from_pretrained("facebook/musicgen-medium")
+     musicgen_model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-medium")
+ 
+     # Move the MusicGen model to GPU if available
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     musicgen_model.to(device)
+     logging.info("AudioGen Medium and MusicGen Medium models loaded successfully.")
+ except Exception as e:
+     logging.error(f"Error loading AudioGen/MusicGen models: {e}")
+     raise
+ 
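+ # Both "medium" checkpoints are on the order of 1.5B parameters each, so expect
+ # several GB of RAM/VRAM; the "small" variants are a drop-in swap if memory is tight.
+ 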
+ # Classify a single frame with the Places365 model
+ def classify_frame(frame):
+     try:
+         preprocess = transforms.Compose([
+             transforms.Resize(128),  # smaller resolution to reduce memory usage
+             transforms.CenterCrop(128),
+             transforms.ToTensor(),
+             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+         ])
+         img = Image.fromarray(frame)
+         img = preprocess(img).unsqueeze(0)
+         with torch.no_grad():
+             output = places365(img.to("cpu"))  # inference stays on CPU
+         probabilities = F.softmax(output, dim=1)
+         _, predicted = torch.max(probabilities, 1)
+         predicted_index = predicted.item()
+ 
+         # Defensive guard: fall back to a default scene if the index is out of range
+         if predicted_index < 0 or predicted_index >= len(SCENE_CLASSES):
+             logging.warning(f"Predicted class index {predicted_index} is out of range. Defaulting to 'nature'.")
+             return "nature"  # default scene type
+ 
+         scene_type = SCENE_CLASSES[predicted_index]
+         logging.info(f"Predicted scene: {scene_type}")
+         return scene_type
+     except Exception as e:
+         logging.error(f"Error classifying frame: {e}")
+         raise
+ 
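+ # classify_frame expects an RGB numpy array of shape (H, W, 3) and returns a
+ # Places365 label string such as "airfield" or "forest_path" (illustrative
+ # examples, not guaranteed outputs for any particular frame).
+ 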
+ # Analyze video content and return the scene type using Places365
+ def analyze_video(video_path):
+     try:
+         logging.info(f"Analyzing video: {video_path}")
+         clip = VideoFileClip(video_path)
+         frame = clip.get_frame(0)  # first frame as an RGB numpy array
+         frame = Image.fromarray(frame)  # convert to a PIL image
+         frame = np.array(frame.resize((128, 128)))  # resize to reduce memory usage
+ 
+         # Classify the frame using Places365
+         scene_type = classify_frame(frame)
+         logging.info(f"Scene type detected: {scene_type}")
+         return scene_type
+     except Exception as e:
+         logging.error(f"Error analyzing video: {e}")
+         raise
+ 
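+ # Note: only the first frame is sampled. Videos whose scene changes over time
+ # would need sampling at several timestamps, e.g. clip.get_frame(t) for a few
+ # values of t, followed by a majority vote over the predicted labels.
+ 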
+ # Generate ambient audio using AudioGen Medium (audiocraft API)
+ def generate_audio_audiogen(scene, duration=10):
+     try:
+         logging.info(f"Generating audio for scene: {scene} using AudioGen Medium...")
+         audiogen_model.set_generation_params(duration=duration)
+         with torch.no_grad():
+             wav = audiogen_model.generate([f"Ambient sounds of {scene}"])  # [batch, channels, samples]
+         audio = wav[0, 0].cpu().numpy().astype(np.float32)
+         audio_path = "generated_audio_audiogen.wav"
+         write_wav(audio_path, audiogen_model.sample_rate, audio)  # AudioGen outputs 16 kHz audio
+         logging.info(f"Audio generated and saved to: {audio_path}")
+         return audio_path
+     except Exception as e:
+         logging.error(f"Error generating audio with AudioGen Medium: {e}")
+         raise
+ 
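+ # Example usage (hypothetical scene label):
+ #   generate_audio_audiogen("forest", duration=5)
+ # writes generated_audio_audiogen.wav and returns its path.
+ 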
+ # Generate background music using MusicGen Medium (transformers API)
+ def generate_music_musicgen(scene, duration=10):
+     try:
+         logging.info(f"Generating music for scene: {scene} using MusicGen Medium...")
+         inputs = musicgen_processor(
+             text=[f"Calm music for {scene}"],
+             padding=True,
+             return_tensors="pt",
+         ).to(musicgen_model.device)  # move inputs to the same device as the model
+         with torch.no_grad():
+             # MusicGen generates roughly 50 audio tokens per second
+             music = musicgen_model.generate(**inputs, max_new_tokens=duration * 50)
+         music = music[0, 0].cpu().numpy().astype(np.float32)
+         music_path = "generated_music_musicgen.wav"
+         # MusicGen's audio encoder runs at 32 kHz, not the 16 kHz assumed originally
+         write_wav(music_path, musicgen_model.config.audio_encoder.sampling_rate, music)
+         logging.info(f"Music generated and saved to: {music_path}")
+         return music_path
+     except Exception as e:
+         logging.error(f"Error generating music with MusicGen Medium: {e}")
+         raise
+ 
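+ # Duration sanity check: 10 s * 50 tokens/s = 500 new tokens, which MusicGen's
+ # audio decoder turns into roughly ten seconds of 32 kHz audio.
+ 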
+ # Merge audio and video into a final file using moviepy (1.x API)
+ def merge_audio_video(video_path, audio_path, output_path="output.mp4"):
+     try:
+         logging.info("Merging audio and video using moviepy...")
+         video_clip = VideoFileClip(video_path)
+         audio_clip = AudioFileClip(audio_path)
+         final_clip = video_clip.set_audio(audio_clip)
+         final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
+         logging.info(f"Final video saved to: {output_path}")
+         return output_path
+     except Exception as e:
+         logging.error(f"Error merging audio and video: {e}")
+         return None
+ 
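+ # Optional hardening (not in the original flow): trim the audio track to the
+ # video's length before muxing, e.g.
+ #   audio_clip = audio_clip.subclip(0, min(audio_clip.duration, video_clip.duration))
+ 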
+ # Main processing function: analyze the scene, generate audio/music, and mux
+ def process_video(video_path, progress=gr.Progress()):
+     try:
+         progress(0.1, desc="Starting video processing...")
+         logging.info("Starting video processing...")
+ 
+         # Analyze the video to determine the scene type
+         progress(0.3, desc="Analyzing video...")
+         scene_type = analyze_video(video_path)
+ 
+         # Generate ambient audio using AudioGen Medium
+         progress(0.5, desc="Generating audio...")
+         audio_path = generate_audio_audiogen(scene_type, duration=10)
+ 
+         # Generate music using MusicGen Medium
+         progress(0.7, desc="Generating music...")
+         music_path = generate_music_musicgen(scene_type, duration=10)
+ 
+         # Merge the generated music with the video. As in the original flow, only
+         # the music track is muxed in; the ambient audio file is kept on disk.
+         progress(0.9, desc="Merging audio and video...")
+         output_path = merge_audio_video(video_path, music_path)
+         if not output_path:
+             return None, "Logs: Merge failed."
+ 
+         logging.info("Video processing completed successfully.")
+         return output_path, "Logs: Processing completed."
+     except Exception as e:
+         logging.error(f"Error in process_video: {e}")
+         # Return None for the video slot: gr.Video cannot render an error string
+         return None, f"Logs: {e}"
+ 
+ # Gradio handler for video upload
+ def gradio_interface(video_file, progress=gr.Progress()):
+     try:
+         logging.info("Gradio interface triggered.")
+         output_video, logs = process_video(video_file, progress)
+         return output_video, logs
+     except Exception as e:
+         logging.error(f"Error in Gradio interface: {e}")
+         return None, f"Logs: {e}"
+ 
+ # Launch Gradio app
+ try:
+     logging.info("Launching Gradio app...")
+     interface = gr.Interface(
+         fn=gradio_interface,
+         inputs=[gr.Video(label="Upload Video")],
+         outputs=[gr.Video(label="Output Video with Generated Audio"), gr.Textbox(label="Logs", lines=10)],
+         title="Video to Video with Generated Audio and Music",
+         description="Upload a video, and this app will analyze it and generate matching audio and music using AudioGen Medium and MusicGen Medium.",
+     )
+     interface.queue()  # enable queueing for long-running generation tasks
+     interface.launch(share=True)
+ except Exception as e:
+     logging.error(f"Error launching Gradio app: {e}")
+     raise
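
Deployment note: since app.py cannot run shell commands, the packages have to come from a requirements.txt. A minimal sketch, assuming the imports above (unpinned except for the moviepy 1.x import path; untested):

    gradio
    moviepy<2.0
    torch
    torchvision
    torchaudio
    numpy
    pillow
    scipy
    transformers
    audiocraft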