usamaijaz-ai committed
Commit c8c0038 · Parent: 09a8733

updated code to add transcription

Files changed (2)
  1. app.py +67 -79
  2. local.py +161 -0
app.py CHANGED
@@ -7,106 +7,78 @@ import subprocess
 import soundfile as sf
 from scipy.signal import resample
 from moviepy.editor import VideoFileClip, AudioFileClip
-from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
+from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
 
 # === Constants ===
 TEMP_VIDEO = "temp_video.mp4"
 RAW_AUDIO = "raw_audio_input"
 CONVERTED_AUDIO = "converted_audio.wav"
-
+MODEL_REPO = "ylacombe/accent-classifier"
 
 # === load local model
 # MODEL_DIR = "model"
 # model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
 # feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR)
 
-# # === Load model from huggingface and feature extractor ===
-MODEL_REPO = "ylacombe/accent-classifier"
-model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO, cache_dir="hf_model_cache")
+
+# === Load models ===
+model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO)
 feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)
+whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
 
+LABELS = [model.config.id2label[i] for i in range(len(model.config.id2label))]
 model.eval()
 
-# === Dynamic label list from model config ===
-LABELS = [model.config.id2label[i] for i in range(len(model.config.id2label))]
-
-
-# === Download video from URL ===
+# === Helpers ===
+def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
+    command = ["ffmpeg", "-y", "-i", input_path, output_path]
+    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    return output_path
+
+def extract_audio_from_video(video_path, output_path="extracted_audio.wav"):
+    clip = VideoFileClip(video_path)
+    if clip.audio is None:
+        raise ValueError("No audio stream found in video.")
+    clip.audio.write_audiofile(output_path)
+    return output_path
+
 def download_video(url, filename=TEMP_VIDEO):
-    import mimetypes
-
     temp_download = "raw_download.mp4"
-    headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
-    }
-
-    try:
-        r = requests.get(url, headers=headers, stream=True, timeout=15)
-        r.raise_for_status()
+    headers = {"User-Agent": "Mozilla/5.0"}
 
-        content_type = r.headers.get("Content-Type", "")
-        if not content_type.startswith("video/"):
-            raise RuntimeError(f"URL does not point to a video file. Content-Type: {content_type}")
+    r = requests.get(url, headers=headers, stream=True, timeout=15)
+    r.raise_for_status()
 
-        with open(temp_download, 'wb') as f:
-            for chunk in r.iter_content(chunk_size=8192):
-                f.write(chunk)
+    if not r.headers.get("Content-Type", "").startswith("video/"):
+        raise RuntimeError(f"URL is not a video. Content-Type: {r.headers.get('Content-Type')}")
 
-    except Exception as e:
-        raise RuntimeError(f"Failed to download video: {e}")
+    with open(temp_download, 'wb') as f:
+        for chunk in r.iter_content(chunk_size=8192):
+            f.write(chunk)
 
-    # Attempt to fix the file with ffmpeg
-    repaired_file = filename
     ffmpeg_cmd = [
         "ffmpeg", "-y", "-i", temp_download,
-        "-c", "copy", "-movflags", "+faststart", repaired_file
+        "-c", "copy", "-movflags", "+faststart", filename
     ]
     result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
 
-    if result.returncode != 0 or not os.path.exists(repaired_file) or os.path.getsize(repaired_file) == 0:
-        print(result.stderr.decode())
-        raise RuntimeError("FFmpeg failed to process the video. File may not be a valid MP4.")
+    if result.returncode != 0 or not os.path.exists(filename) or os.path.getsize(filename) == 0:
+        raise RuntimeError("FFmpeg failed to process the video.")
 
     os.remove(temp_download)
-    return repaired_file
-
-
-
+    return filename
 
-# === Extract audio from video ===
-def extract_audio_from_video(video_path, output_path=RAW_AUDIO + ".mp4"):
-    clip = VideoFileClip(video_path)
-    if clip.audio is None:
-        raise ValueError("No audio stream found in video.")
-    clip.audio.write_audiofile(output_path)
-    return output_path
-
-# === Convert any input audio to WAV using ffmpeg ===
-def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
-    command = ["ffmpeg", "-y", "-i", input_path, output_path]
-    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-    return output_path
-
-# === Run accent classification ===
 def classify_accent(audio_path):
     waveform, sr = sf.read(audio_path)
-
     if len(waveform.shape) > 1:
-        waveform = waveform.mean(axis=1)  # Convert stereo to mono
+        waveform = waveform.mean(axis=1)
 
-    target_sr = 16000
-    if sr != target_sr:
-        num_samples = int(len(waveform) * target_sr / sr)
+    if sr != 16000:
+        num_samples = int(len(waveform) * 16000 / sr)
         waveform = resample(waveform, num_samples)
-        sr = target_sr
-
-    inputs = feature_extractor(
-        waveform,
-        sampling_rate=sr,
-        return_tensors="pt",
-        padding=True
-    )
+        sr = 16000
 
+    inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)
     with torch.no_grad():
         outputs = model(**inputs)
         logits = outputs.logits[0]
@@ -121,52 +93,68 @@ def classify_accent(audio_path):
     top5_scores = [round(p, 4) for p in top5.values.tolist()]
     top5_text = "\n".join([f"{label}: {score}" for label, score in zip(top5_labels, top5_scores)])
 
-    return f"Top prediction: {top_label}", top_conf, top_label, audio_path, top5_text
+    return top_label, top_conf, top5_text
 
+def transcribe_audio(audio_path):
+    result = whisper(audio_path, return_timestamps=True)
+    return result.get("text", "").strip()
 
-# === Main Gradio handler ===
-def process_input(video_url, uploaded_audio):
+# === Main Handler ===
+def process_input(audio_file, video_file, video_url):
     try:
         audio_path = None
 
-        if uploaded_audio:
-            shutil.copy(uploaded_audio, RAW_AUDIO)
+        if audio_file:
+            shutil.copy(audio_file, RAW_AUDIO)
             audio_path = convert_to_wav(RAW_AUDIO)
 
+        elif video_file:
+            shutil.copy(video_file, TEMP_VIDEO)
+            extracted = extract_audio_from_video(TEMP_VIDEO, output_path="extracted_audio.wav")
+            audio_path = convert_to_wav(extracted)
+
         elif video_url and video_url.strip():
-            download_video(video_url)
-            extracted = extract_audio_from_video(TEMP_VIDEO)
+            if "loom.com" in video_url:
+                return "Loom links are not supported. Please upload the file or use a direct .mp4 URL.", None, None, None, None, None
+            downloaded = download_video(video_url)
+            extracted = extract_audio_from_video(downloaded, output_path="extracted_audio.wav")
             audio_path = convert_to_wav(extracted)
 
+
         else:
-            return "Please provide a video URL or upload an audio file.", None, None, None, None
+            return "Please provide an audio file, a video file, or a direct video URL.", None, None, None, None, None
+
+        label, confidence, top5 = classify_accent(audio_path)
+        transcription = transcribe_audio(audio_path)
 
-        return classify_accent(audio_path)
+        return f"Top prediction: {label}", confidence, label, audio_path, top5, transcription
 
     except Exception as e:
-        return f"Error: {str(e)}", None, None, None, None
+        return f"Error: {str(e)}", None, None, None, None, None
 
     finally:
         for f in [TEMP_VIDEO, RAW_AUDIO, CONVERTED_AUDIO, RAW_AUDIO + ".mp4"]:
             if os.path.exists(f):
                 os.remove(f)
 
-# === Gradio UI ===
+# === Gradio Interface ===
 interface = gr.Interface(
     fn=process_input,
     inputs=[
-        gr.Textbox(label="Enter Loom or MP4 Video URL (optional)"),
-        gr.Audio(label="Upload MP3 or WAV (optional)", type="filepath")
+        gr.Audio(label="Upload MP3 or WAV", type="filepath"),
+        gr.File(label="Upload MP4 Video", type="filepath"),
+        gr.Textbox(label="Paste Direct .mp4 Video URL")
    ],
    outputs=[
        gr.Text(label="Prediction"),
        gr.Number(label="Confidence Score"),
        gr.Text(label="Accent"),
        gr.Audio(label="Processed Audio", type="filepath"),
-        gr.Text(label="Top 5 Predictions")
+        gr.Text(label="Top 5 Predictions"),
+        gr.Text(label="Transcription")
    ],
-    title="Accent Classifier",
-    description="Upload an audio file or Loom/MP4 link to detect speaker's accent with top-5 prediction breakdown."
+    title="Accent Classifier + Transcriber",
+    description="Upload an audio or video file OR paste a direct video URL to classify the accent and transcribe the speech."
 )
 
 if __name__ == "__main__":
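
Note: the new transcribe_audio helper calls the Hugging Face ASR pipeline with return_timestamps=True, which lets Whisper transcribe clips longer than its 30-second window. A minimal standalone sketch of that path (the file name "sample.wav" is a placeholder):

    from transformers import pipeline

    # Same checkpoint as app.py; whisper-tiny trades accuracy for speed.
    whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

    def transcribe(path):
        # return_timestamps=True enables long-form (>30 s) transcription
        result = whisper(path, return_timestamps=True)
        return result.get("text", "").strip()

    print(transcribe("sample.wav"))  # e.g. a 16 kHz mono WAV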
local.py ADDED
@@ -0,0 +1,161 @@
+import os
+import gradio as gr
+import torch
+import shutil
+import requests
+import subprocess
+import soundfile as sf
+from scipy.signal import resample
+from moviepy.editor import VideoFileClip, AudioFileClip
+from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
+
+# === Constants ===
+TEMP_VIDEO = "temp_video.mp4"
+RAW_AUDIO = "raw_audio_input"
+CONVERTED_AUDIO = "converted_audio.wav"
+MODEL_REPO = "ylacombe/accent-classifier"
+
+# === load local model
+MODEL_DIR = "model"
+model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
+feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR)
+
+
+# === Load models ===
+# model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO)
+# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)
+whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
+
+LABELS = [model.config.id2label[i] for i in range(len(model.config.id2label))]
+model.eval()
+
+# === Helpers ===
+def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
+    command = ["ffmpeg", "-y", "-i", input_path, output_path]
+    subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    return output_path
+
+def extract_audio_from_video(video_path, output_path="extracted_audio.wav"):
+    clip = VideoFileClip(video_path)
+    if clip.audio is None:
+        raise ValueError("No audio stream found in video.")
+    clip.audio.write_audiofile(output_path)
+    return output_path
+
+def download_video(url, filename=TEMP_VIDEO):
+    temp_download = "raw_download.mp4"
+    headers = {"User-Agent": "Mozilla/5.0"}
+
+    r = requests.get(url, headers=headers, stream=True, timeout=15)
+    r.raise_for_status()
+
+    if not r.headers.get("Content-Type", "").startswith("video/"):
+        raise RuntimeError(f"URL is not a video. Content-Type: {r.headers.get('Content-Type')}")
+
+    with open(temp_download, 'wb') as f:
+        for chunk in r.iter_content(chunk_size=8192):
+            f.write(chunk)
+
+    ffmpeg_cmd = [
+        "ffmpeg", "-y", "-i", temp_download,
+        "-c", "copy", "-movflags", "+faststart", filename
+    ]
+    result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    if result.returncode != 0 or not os.path.exists(filename) or os.path.getsize(filename) == 0:
+        raise RuntimeError("FFmpeg failed to process the video.")
+
+    os.remove(temp_download)
+    return filename
+
+def classify_accent(audio_path):
+    waveform, sr = sf.read(audio_path)
+    if len(waveform.shape) > 1:
+        waveform = waveform.mean(axis=1)
+
+    if sr != 16000:
+        num_samples = int(len(waveform) * 16000 / sr)
+        waveform = resample(waveform, num_samples)
+        sr = 16000
+
+    inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)
+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits[0]
+        probs = torch.nn.functional.softmax(logits, dim=-1)
+
+    top_idx = torch.argmax(probs).item()
+    top_label = LABELS[top_idx]
+    top_conf = round(probs[top_idx].item(), 4)
+
+    top5 = torch.topk(probs, k=5)
+    top5_labels = [LABELS[i] for i in top5.indices.tolist()]
+    top5_scores = [round(p, 4) for p in top5.values.tolist()]
+    top5_text = "\n".join([f"{label}: {score}" for label, score in zip(top5_labels, top5_scores)])
+
+    return top_label, top_conf, top5_text
+
+def transcribe_audio(audio_path):
+    result = whisper(audio_path, return_timestamps=True)
+    return result.get("text", "").strip()
+
+# === Main Handler ===
+def process_input(audio_file, video_file, video_url):
+    try:
+        audio_path = None
+
+        if audio_file:
+            shutil.copy(audio_file, RAW_AUDIO)
+            audio_path = convert_to_wav(RAW_AUDIO)
+
+        elif video_file:
+            shutil.copy(video_file, TEMP_VIDEO)
+            extracted = extract_audio_from_video(TEMP_VIDEO, output_path="extracted_audio.wav")
+            audio_path = convert_to_wav(extracted)
+
+        elif video_url and video_url.strip():
+            if "loom.com" in video_url:
+                return "Loom links are not supported. Please upload the file or use a direct .mp4 URL.", None, None, None, None, None
+            downloaded = download_video(video_url)
+            extracted = extract_audio_from_video(downloaded, output_path="extracted_audio.wav")
+            audio_path = convert_to_wav(extracted)
+
+
+        else:
+            return "Please provide an audio file, a video file, or a direct video URL.", None, None, None, None, None
+
+        label, confidence, top5 = classify_accent(audio_path)
+        transcription = transcribe_audio(audio_path)
+
+        return f"Top prediction: {label}", confidence, label, audio_path, top5, transcription
+
+    except Exception as e:
+        return f"Error: {str(e)}", None, None, None, None, None
+
+    finally:
+        for f in [TEMP_VIDEO, RAW_AUDIO, CONVERTED_AUDIO, RAW_AUDIO + ".mp4"]:
+            if os.path.exists(f):
+                os.remove(f)
+
+# === Gradio Interface ===
+interface = gr.Interface(
+    fn=process_input,
+    inputs=[
+        gr.Audio(label="Upload MP3 or WAV", type="filepath"),
+        gr.File(label="Upload MP4 Video", type="filepath"),
+        gr.Textbox(label="Paste Direct .mp4 Video URL")
+    ],
+    outputs=[
+        gr.Text(label="Prediction"),
+        gr.Number(label="Confidence Score"),
+        gr.Text(label="Accent"),
+        gr.Audio(label="Processed Audio", type="filepath"),
+        gr.Text(label="Top 5 Predictions"),
+        gr.Text(label="Transcription")
+    ],
+    title="Accent Classifier + Transcriber",
+    description="Upload an audio or video file OR paste a direct video URL to classify the accent and transcribe the speech."
+)
+
+if __name__ == "__main__":
+    interface.launch()
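
Note: local.py loads the accent classifier from a local model/ directory with local_files_only=True, so a checkpoint must be saved there beforehand. One way to populate it, assuming one-time network access to the same MODEL_REPO (a sketch, not part of this commit):

    from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

    MODEL_REPO = "ylacombe/accent-classifier"
    MODEL_DIR = "model"  # matches MODEL_DIR in local.py

    # Download once, then save so local_files_only=True loads succeed offline.
    Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO).save_pretrained(MODEL_DIR)
    Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO).save_pretrained(MODEL_DIR)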