Ahmadkhan12 committed
Commit 14984e1 · verified · 1 Parent(s): 961fd0e

Update app.py

Files changed (1): app.py (+14 −157)
app.py CHANGED
@@ -11,11 +11,24 @@ from scipy.io.wavfile import write as write_wav
  from scipy import signal
  from moviepy.editor import VideoFileClip, AudioFileClip
  from transformers import AutoProcessor, AutoModelForAudioGeneration
+ import requests  # Add this line
 
  # Set up logging for better debug tracking
  logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
  logger = logging.getLogger()
 
+ # Download Places365 class labels
+ try:
+     logging.info("Downloading Places365 class labels...")
+     url = "http://places2.csail.mit.edu/models_places365/categories_places365.txt"
+     response = requests.get(url)
+     with open("categories_places365.txt", "wb") as f:
+         f.write(response.content)
+     logging.info("Places365 class labels downloaded successfully.")
+ except Exception as e:
+     logging.error(f"Error downloading Places365 class labels: {e}")
+     raise
+
  # Load Places365 model for scene detection (on CPU to save GPU memory)
  try:
      logging.info("Loading Places365 model for scene detection...")
@@ -28,7 +41,6 @@ except Exception as e:
      raise
 
  # Load Places365 class labels
- !wget http://places2.csail.mit.edu/models_places365/categories_places365.txt
  with open("categories_places365.txt", "r") as f:
      SCENE_CLASSES = [line.strip().split(" ")[0][3:] for line in f.readlines()]
 
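Two details in this hunk are easy to miss. The removed `!wget` line is IPython notebook shell syntax, not Python, so it is a SyntaxError when app.py runs as a plain script; that is what the new `requests` download above replaces. Also, the `split(" ")[0][3:]` parsing depends on the label file's `/a/abbey 0` line format. A standalone sketch of both steps follows; the `timeout` and `raise_for_status()` hardening are additions for illustration, not part of the commit:

import requests

URL = "http://places2.csail.mit.edu/models_places365/categories_places365.txt"

# Fetch the label file; raise_for_status() fails loudly on HTTP errors
# instead of silently writing an error page to disk.
response = requests.get(URL, timeout=30)
response.raise_for_status()
with open("categories_places365.txt", "wb") as f:
    f.write(response.content)

# Each line looks like "/a/abbey 0": split(" ")[0] keeps the path and
# [3:] drops the "/a/" prefix, leaving the bare category name.
with open("categories_places365.txt") as f:
    SCENE_CLASSES = [line.strip().split(" ")[0][3:] for line in f]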
@@ -49,159 +61,4 @@ except Exception as e:
      logging.error(f"Error loading AudioGen/MusicGen models: {e}")
      raise
 
- # Function to classify a frame using Places365
- def classify_frame(frame):
-     try:
-         preprocess = transforms.Compose([
-             transforms.Resize(128),  # Smaller resolution
-             transforms.CenterCrop(128),
-             transforms.ToTensor(),
-             transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-         ])
-         img = Image.fromarray(frame)
-         img = preprocess(img).unsqueeze(0)
-         with torch.no_grad():
-             output = places365(img.to("cpu"))  # Ensure inference on CPU
-         probabilities = F.softmax(output, dim=1)
-         _, predicted = torch.max(probabilities, 1)
-         predicted_index = predicted.item()
-
-         # Ensure the predicted index is within the range of SCENE_CLASSES
-         if predicted_index >= len(SCENE_CLASSES) or predicted_index < 0:
-             logging.warning(f"Predicted class index {predicted_index} is out of range. Defaulting to 'nature'.")
-             return "nature"  # Default scene type
-
-         scene_type = SCENE_CLASSES[predicted_index]
-         logging.info(f"Predicted scene: {scene_type}")
-         return scene_type
-     except Exception as e:
-         logging.error(f"Error classifying frame: {e}")
-         raise
-
- # Function to analyze video content and return the scene type using Places365
- def analyze_video(video_path):
-     try:
-         logging.info(f"Analyzing video: {video_path}")
-         clip = VideoFileClip(video_path)
-         frame = clip.get_frame(0)  # Get the first frame
-         frame = Image.fromarray(frame)  # Convert to PIL image
-         frame = np.array(frame.resize((128, 128)))  # Resize to reduce memory usage
-
-         # Classify the frame using Places365
-         scene_type = classify_frame(frame)
-         logging.info(f"Scene type detected: {scene_type}")
-         return scene_type
-     except Exception as e:
-         logging.error(f"Error analyzing video: {e}")
-         raise
-
- # Function to generate audio using AudioGen Medium
- def generate_audio_audiogen(scene, duration=10):
-     try:
-         logging.info(f"Generating audio for scene: {scene} using AudioGen Medium...")
-         inputs = audiogen_processor(
-             text=[f"Ambient sounds of {scene}"],
-             padding=True,
-             return_tensors="pt",
-         ).to(audiogen_model.device)  # Move inputs to the same device as the model
-         with torch.no_grad():
-             audio = audiogen_model.generate(**inputs, max_new_tokens=duration * 50)  # Adjust tokens for duration
-         audio = audio.cpu().numpy().squeeze()
-         audio_path = "generated_audio_audiogen.wav"
-         write_wav(audio_path, 16000, audio)  # Save as WAV file
-         logging.info(f"Audio generated and saved to: {audio_path}")
-         return audio_path
-     except Exception as e:
-         logging.error(f"Error generating audio with AudioGen Medium: {e}")
-         raise
-
- # Function to generate music using MusicGen Medium
- def generate_music_musicgen(scene, duration=10):
-     try:
-         logging.info(f"Generating music for scene: {scene} using MusicGen Medium...")
-         inputs = musicgen_processor(
-             text=[f"Calm music for {scene}"],
-             padding=True,
-             return_tensors="pt",
-         ).to(musicgen_model.device)  # Move inputs to the same device as the model
-         with torch.no_grad():
-             music = musicgen_model.generate(**inputs, max_new_tokens=duration * 50)  # Adjust tokens for duration
-         music = music.cpu().numpy().squeeze()
-         music_path = "generated_music_musicgen.wav"
-         write_wav(music_path, 16000, music)  # Save as WAV file
-         logging.info(f"Music generated and saved to: {music_path}")
-         return music_path
-     except Exception as e:
-         logging.error(f"Error generating music with MusicGen Medium: {e}")
-         raise
-
- # Function to merge audio and video into a final video file using moviepy
- def merge_audio_video(video_path, audio_path, output_path="output.mp4"):
-     try:
-         logging.info("Merging audio and video using moviepy...")
-         video_clip = VideoFileClip(video_path)
-         audio_clip = AudioFileClip(audio_path)
-         final_clip = video_clip.set_audio(audio_clip)
-         final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
-         logging.info(f"Final video saved to: {output_path}")
-         return output_path
-     except Exception as e:
-         logging.error(f"Error merging audio and video: {e}")
-         return None
-
- # Main processing function to handle video upload, scene analysis, and video output
- def process_video(video_path, progress=gr.Progress()):
-     try:
-         progress(0.1, desc="Starting video processing...")
-         logging.info("Starting video processing...")
-
-         # Analyze the video to determine the scene type
-         progress(0.3, desc="Analyzing video...")
-         scene_type = analyze_video(video_path)
-
-         # Generate audio using AudioGen Medium
-         progress(0.5, desc="Generating audio...")
-         audio_path = generate_audio_audiogen(scene_type, duration=10)
-
-         # Generate music using MusicGen Medium
-         progress(0.7, desc="Generating music...")
-         music_path = generate_music_musicgen(scene_type, duration=10)
-
-         # Merge the generated audio with the video and output the final video
-         progress(0.9, desc="Merging audio and video...")
-         output_path = merge_audio_video(video_path, music_path)
-         if not output_path:
-             return "Error: Failed to merge audio and video.", "Logs: Merge failed."
-
-         logging.info("Video processing completed successfully.")
-         return output_path, "Logs: Processing completed."
-     except Exception as e:
-         logging.error(f"Error in process_video: {e}")
-         return f"An error occurred during processing: {e}", f"Logs: {e}"
-
- # Gradio UI for video upload
- def gradio_interface(video_file, progress=gr.Progress()):
-     try:
-         progress(0.1, desc="Starting video processing...")
-         logging.info("Gradio interface triggered.")
-         output_video, logs = process_video(video_file, progress)
-         return output_video, logs
-     except Exception as e:
-         logging.error(f"Error in Gradio interface: {e}")
-         return f"An error occurred: {e}", f"Logs: {e}"
-
- # Launch Gradio app
- try:
-     logging.info("Launching Gradio app...")
-     interface = gr.Interface(
-         fn=gradio_interface,
-         inputs=[gr.Video(label="Upload Video")],
-         outputs=[gr.Video(label="Output Video with Generated Audio"), gr.Textbox(label="Logs", lines=10)],
-         title="Video to Video with Generated Audio and Music",
-         description="Upload a video, and this app will analyze it and generate matching audio and music using AudioGen Medium and MusicGen Medium."
-     )
-     interface.queue()  # Enable queue for long-running tasks
-     interface.launch(share=True)  # Launch the app
- except Exception as e:
-     logging.error(f"Error launching Gradio app: {e}")
-     raise
+ # Rest of the code remains the same...
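A side note on the removed pipeline: `process_video` generated both an ambient track (`generate_audio_audiogen`) and a music track (`generate_music_musicgen`) but passed only `music_path` to `merge_audio_video`, so the AudioGen output was never used. If that code is restored, the two tracks could be layered with moviepy's `CompositeAudioClip`; a minimal sketch (the function name and the 0.5 music gain are illustrative, not from the commit):

from moviepy.editor import AudioFileClip, CompositeAudioClip, VideoFileClip

def merge_audio_video_mixed(video_path, ambient_path, music_path, output_path="output.mp4"):
    video_clip = VideoFileClip(video_path)
    # Overlay the ambient and music tracks instead of discarding one of them.
    mixed = CompositeAudioClip([
        AudioFileClip(ambient_path),
        AudioFileClip(music_path).volumex(0.5),  # keep the music under the ambience
    ]).set_duration(video_clip.duration)
    final_clip = video_clip.set_audio(mixed)
    final_clip.write_videofile(output_path, codec="libx264", audio_codec="aac")
    return output_path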
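One caveat for anyone running this file: to my knowledge `transformers` does not ship an `AutoModelForAudioGeneration` class, so the unchanged import near the top of this diff will fail before any of the new download code runs. MusicGen is exposed as `MusicgenForConditionalGeneration` (AudioGen is distributed through Meta's separate audiocraft package rather than transformers). A minimal text-to-music sketch against the documented MusicGen API; the checkpoint name and token budget here are illustrative:

import scipy.io.wavfile
from transformers import AutoProcessor, MusicgenForConditionalGeneration

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

inputs = processor(text=["Calm music for a forest"], padding=True, return_tensors="pt")
# MusicGen emits roughly 50 audio tokens per second, so 500 tokens is about 10 s.
audio = model.generate(**inputs, max_new_tokens=500)

# MusicGen decodes at 32 kHz; saving at 16000 Hz, as the removed code did,
# would play back at half speed and an octave too low.
rate = model.config.audio_encoder.sampling_rate  # 32000 for MusicGen
scipy.io.wavfile.write("musicgen_out.wav", rate, audio[0, 0].numpy())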