Spaces:
Sleeping
Sleeping
Commit
·
c8c0038
1
Parent(s):
09a8733
updated code to add transcription
Browse files
app.py
CHANGED
@@ -7,106 +7,78 @@ import subprocess
|
|
7 |
import soundfile as sf
|
8 |
from scipy.signal import resample
|
9 |
from moviepy.editor import VideoFileClip, AudioFileClip
|
10 |
-
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
|
11 |
|
12 |
# === Constants ===
|
13 |
TEMP_VIDEO = "temp_video.mp4"
|
14 |
RAW_AUDIO = "raw_audio_input"
|
15 |
CONVERTED_AUDIO = "converted_audio.wav"
|
16 |
-
|
17 |
|
18 |
# === load local model
|
19 |
# MODEL_DIR = "model"
|
20 |
# model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
|
21 |
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR)
|
22 |
|
23 |
-
|
24 |
-
|
25 |
-
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO
|
26 |
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)
|
|
|
27 |
|
|
|
28 |
model.eval()
|
29 |
|
30 |
-
# ===
|
31 |
-
|
|
|
|
|
|
|
32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
-
# === Download video from URL ===
|
35 |
def download_video(url, filename=TEMP_VIDEO):
|
36 |
-
import mimetypes
|
37 |
-
|
38 |
temp_download = "raw_download.mp4"
|
39 |
-
headers = {
|
40 |
-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
|
41 |
-
}
|
42 |
-
|
43 |
-
try:
|
44 |
-
r = requests.get(url, headers=headers, stream=True, timeout=15)
|
45 |
-
r.raise_for_status()
|
46 |
|
47 |
-
|
48 |
-
|
49 |
-
raise RuntimeError(f"URL does not point to a video file. Content-Type: {content_type}")
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
f.write(chunk)
|
54 |
|
55 |
-
|
56 |
-
|
|
|
57 |
|
58 |
-
# Attempt to fix the file with ffmpeg
|
59 |
-
repaired_file = filename
|
60 |
ffmpeg_cmd = [
|
61 |
"ffmpeg", "-y", "-i", temp_download,
|
62 |
-
"-c", "copy", "-movflags", "+faststart",
|
63 |
]
|
64 |
result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
65 |
|
66 |
-
if result.returncode != 0 or not os.path.exists(
|
67 |
-
|
68 |
-
raise RuntimeError("FFmpeg failed to process the video. File may not be a valid MP4.")
|
69 |
|
70 |
os.remove(temp_download)
|
71 |
-
return
|
72 |
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
# === Extract audio from video ===
|
77 |
-
def extract_audio_from_video(video_path, output_path=RAW_AUDIO + ".mp4"):
    """Write the audio track of a video file to ``output_path``.

    Args:
        video_path: Path of the video opened with ``VideoFileClip``.
        output_path: Destination handed to ``write_audiofile``.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If the clip reports no audio stream.
    """
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError("No audio stream found in video.")
        clip.audio.write_audiofile(output_path)
    finally:
        # Always release what the clip holds, including on the error path.
        clip.close()
    return output_path
|
83 |
-
|
84 |
-
# === Convert any input audio to WAV using ffmpeg ===
|
85 |
-
def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
    """Convert a media file to ``output_path`` by invoking ffmpeg.

    Args:
        input_path: Path passed to ffmpeg via ``-i``.
        output_path: Destination path; ``-y`` lets ffmpeg overwrite it.

    Returns:
        ``output_path``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    command = ["ffmpeg", "-y", "-i", input_path, output_path]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Surface the failure here instead of letting a later read of the
    # missing output file produce a confusing error.
    if result.returncode != 0:
        stderr_lines = result.stderr.decode("utf-8", errors="replace").strip().splitlines()
        reason = stderr_lines[-1] if stderr_lines else "no error output"
        raise RuntimeError(f"FFmpeg failed to convert the input to WAV: {reason}")
    return output_path
|
89 |
-
|
90 |
-
# === Run accent classification ===
|
91 |
def classify_accent(audio_path):
|
92 |
waveform, sr = sf.read(audio_path)
|
93 |
-
|
94 |
if len(waveform.shape) > 1:
|
95 |
-
waveform = waveform.mean(axis=1)
|
96 |
|
97 |
-
|
98 |
-
|
99 |
-
num_samples = int(len(waveform) * target_sr / sr)
|
100 |
waveform = resample(waveform, num_samples)
|
101 |
-
sr =
|
102 |
-
|
103 |
-
inputs = feature_extractor(
|
104 |
-
waveform,
|
105 |
-
sampling_rate=sr,
|
106 |
-
return_tensors="pt",
|
107 |
-
padding=True
|
108 |
-
)
|
109 |
|
|
|
110 |
with torch.no_grad():
|
111 |
outputs = model(**inputs)
|
112 |
logits = outputs.logits[0]
|
@@ -121,52 +93,68 @@ def classify_accent(audio_path):
|
|
121 |
top5_scores = [round(p, 4) for p in top5.values.tolist()]
|
122 |
top5_text = "\n".join([f"{label}: {score}" for label, score in zip(top5_labels, top5_scores)])
|
123 |
|
124 |
-
return
|
125 |
|
|
|
|
|
|
|
126 |
|
127 |
-
# === Main
|
128 |
-
def process_input(
|
129 |
try:
|
130 |
audio_path = None
|
131 |
|
132 |
-
if
|
133 |
-
shutil.copy(
|
134 |
audio_path = convert_to_wav(RAW_AUDIO)
|
135 |
|
|
|
|
|
|
|
|
|
|
|
136 |
elif video_url and video_url.strip():
|
137 |
-
|
138 |
-
|
|
|
|
|
139 |
audio_path = convert_to_wav(extracted)
|
140 |
|
|
|
141 |
else:
|
142 |
-
return "Please provide a video
|
|
|
|
|
|
|
143 |
|
144 |
-
return
|
145 |
|
146 |
except Exception as e:
|
147 |
-
return f"Error: {str(e)}", None, None, None, None
|
148 |
|
149 |
finally:
|
150 |
for f in [TEMP_VIDEO, RAW_AUDIO, CONVERTED_AUDIO, RAW_AUDIO + ".mp4"]:
|
151 |
if os.path.exists(f):
|
152 |
os.remove(f)
|
153 |
|
154 |
-
# === Gradio
|
155 |
interface = gr.Interface(
|
156 |
fn=process_input,
|
157 |
inputs=[
|
158 |
-
gr.
|
159 |
-
gr.
|
|
|
160 |
],
|
161 |
outputs=[
|
162 |
gr.Text(label="Prediction"),
|
163 |
gr.Number(label="Confidence Score"),
|
164 |
gr.Text(label="Accent"),
|
165 |
gr.Audio(label="Processed Audio", type="filepath"),
|
166 |
-
gr.Text(label="Top 5 Predictions")
|
|
|
167 |
],
|
168 |
-
title="Accent Classifier",
|
169 |
-
description="Upload an audio file
|
170 |
)
|
171 |
|
172 |
if __name__ == "__main__":
|
|
|
7 |
import soundfile as sf
|
8 |
from scipy.signal import resample
|
9 |
from moviepy.editor import VideoFileClip, AudioFileClip
|
10 |
+
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
|
11 |
|
12 |
# === Constants ===
# Fixed working-file names used by the helpers and the request handler.
TEMP_VIDEO = "temp_video.mp4"            # default target of download_video
RAW_AUDIO = "raw_audio_input"            # copy of an uploaded audio file
CONVERTED_AUDIO = "converted_audio.wav"  # default output of convert_to_wav
MODEL_REPO = "ylacombe/accent-classifier"

# === load local model
# Alternative to the repository load below; kept disabled:
# MODEL_DIR = "model"
# model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR)


# === Load models ===
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Class names ordered by class index, so LABELS[i] names logit i.
LABELS = [
    model.config.id2label[class_idx]
    for class_idx in range(len(model.config.id2label))
]
model.eval()
|
31 |
|
32 |
+
# === Helpers ===
|
33 |
+
def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
    """Convert a media file to ``output_path`` by invoking ffmpeg.

    Args:
        input_path: Path passed to ffmpeg via ``-i``.
        output_path: Destination path; ``-y`` lets ffmpeg overwrite it.

    Returns:
        ``output_path``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    command = ["ffmpeg", "-y", "-i", input_path, output_path]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Surface the failure here instead of letting a later read of the
    # missing output file produce a confusing error.
    if result.returncode != 0:
        stderr_lines = result.stderr.decode("utf-8", errors="replace").strip().splitlines()
        reason = stderr_lines[-1] if stderr_lines else "no error output"
        raise RuntimeError(f"FFmpeg failed to convert the input to WAV: {reason}")
    return output_path
|
37 |
|
38 |
+
def extract_audio_from_video(video_path, output_path="extracted_audio.wav"):
    """Write the audio track of a video file to ``output_path``.

    Args:
        video_path: Path of the video opened with ``VideoFileClip``.
        output_path: Destination handed to ``write_audiofile``.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If the clip reports no audio stream.
    """
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError("No audio stream found in video.")
        clip.audio.write_audiofile(output_path)
    finally:
        # Always release what the clip holds, including on the error path.
        clip.close()
    return output_path
|
44 |
|
|
|
45 |
def download_video(url, filename=TEMP_VIDEO):
    """Download a video over HTTP and stream-copy it to ``filename`` with ffmpeg.

    The response body is streamed to a scratch file, which is then passed
    through ``ffmpeg -c copy -movflags +faststart``. The scratch file is
    removed on every exit path.

    Args:
        url: Address to fetch; the response must report a ``video/*``
            Content-Type.
        filename: Path of the ffmpeg output.

    Returns:
        ``filename``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the Content-Type is not ``video/*`` or ffmpeg does
            not produce a non-empty output file.
    """
    temp_download = "raw_download.mp4"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        # The context manager releases the streamed connection on every path,
        # including the early failures below.
        with requests.get(url, headers=headers, stream=True, timeout=15) as r:
            r.raise_for_status()

            if not r.headers.get("Content-Type", "").startswith("video/"):
                raise RuntimeError(f"URL is not a video. Content-Type: {r.headers.get('Content-Type')}")

            with open(temp_download, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        ffmpeg_cmd = [
            "ffmpeg", "-y", "-i", temp_download,
            "-c", "copy", "-movflags", "+faststart", filename,
        ]
        result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if result.returncode != 0 or not os.path.exists(filename) or os.path.getsize(filename) == 0:
            raise RuntimeError("FFmpeg failed to process the video.")
    finally:
        # Do not leave the raw download behind when ffmpeg (or anything
        # before it) fails.
        if os.path.exists(temp_download):
            os.remove(temp_download)

    return filename
|
70 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
def classify_accent(audio_path):
|
72 |
waveform, sr = sf.read(audio_path)
|
|
|
73 |
if len(waveform.shape) > 1:
|
74 |
+
waveform = waveform.mean(axis=1)
|
75 |
|
76 |
+
if sr != 16000:
|
77 |
+
num_samples = int(len(waveform) * 16000 / sr)
|
|
|
78 |
waveform = resample(waveform, num_samples)
|
79 |
+
sr = 16000
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
81 |
+
inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)
|
82 |
with torch.no_grad():
|
83 |
outputs = model(**inputs)
|
84 |
logits = outputs.logits[0]
|
|
|
93 |
top5_scores = [round(p, 4) for p in top5.values.tolist()]
|
94 |
top5_text = "\n".join([f"{label}: {score}" for label, score in zip(top5_labels, top5_scores)])
|
95 |
|
96 |
+
return top_label, top_conf, top5_text
|
97 |
|
98 |
+
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level ``whisper`` pipeline.

    Args:
        audio_path: Path of the audio file handed to the pipeline.

    Returns:
        The ``"text"`` entry of the pipeline result with surrounding
        whitespace removed, or an empty string when that entry is absent.
    """
    output = whisper(audio_path, return_timestamps=True)
    text = output.get("text", "")
    return text.strip()
|
101 |
|
102 |
+
# === Main Handler ===
|
103 |
+
def process_input(audio_file, video_file, video_url):
    """Gradio handler: classify the accent and transcribe the first input given.

    Inputs are tried in order: uploaded audio, uploaded video, then a direct
    video URL. Whichever is used is converted to WAV before it is classified
    and transcribed.

    Args:
        audio_file: Path of an uploaded audio file, or a falsy value.
        video_file: Path of an uploaded video file, or a falsy value.
        video_url: Direct video URL, or an empty/blank string.

    Returns:
        A 6-tuple matching the interface outputs: prediction text, confidence,
        accent label, processed audio path, top-5 text, transcription. When no
        input is usable or an error occurs, the first item is a message and
        the remaining five are ``None``.
    """
    extracted_audio = "extracted_audio.wav"
    # Path handed back to the caller; it must survive the cleanup below.
    returned_audio = None
    try:
        audio_path = None

        if audio_file:
            shutil.copy(audio_file, RAW_AUDIO)
            audio_path = convert_to_wav(RAW_AUDIO)

        elif video_file:
            shutil.copy(video_file, TEMP_VIDEO)
            extracted = extract_audio_from_video(TEMP_VIDEO, output_path=extracted_audio)
            audio_path = convert_to_wav(extracted)

        elif video_url and video_url.strip():
            if "loom.com" in video_url:
                return "Loom links are not supported. Please upload the file or use a direct .mp4 URL.", None, None, None, None, None
            # Pasted URLs often carry stray whitespace; request the clean one.
            downloaded = download_video(video_url.strip())
            extracted = extract_audio_from_video(downloaded, output_path=extracted_audio)
            audio_path = convert_to_wav(extracted)

        else:
            return "Please provide an audio file, a video file, or a direct video URL.", None, None, None, None, None

        label, confidence, top5 = classify_accent(audio_path)
        transcription = transcribe_audio(audio_path)

        returned_audio = audio_path
        return f"Top prediction: {label}", confidence, label, audio_path, top5, transcription

    except Exception as e:
        return f"Error: {str(e)}", None, None, None, None, None

    finally:
        # This runs before the return value reaches the caller. The processed
        # audio path feeds the gr.Audio output, so that file is kept on
        # success; every other working file is removed.
        for f in [TEMP_VIDEO, RAW_AUDIO, CONVERTED_AUDIO, RAW_AUDIO + ".mp4", extracted_audio]:
            if f != returned_audio and os.path.exists(f):
                os.remove(f)
|
139 |
|
140 |
+
# === Gradio Interface ===
|
141 |
interface = gr.Interface(
    fn=process_input,
    # One input per source process_input accepts, in its parameter order.
    inputs=[
        gr.Audio(type="filepath", label="Upload MP3 or WAV"),
        gr.File(type="filepath", label="Upload MP4 Video"),
        gr.Textbox(label="Paste Direct .mp4 Video URL"),
    ],
    # Same order as the 6-tuple returned by process_input.
    outputs=[
        gr.Text(label="Prediction"),
        gr.Number(label="Confidence Score"),
        gr.Text(label="Accent"),
        gr.Audio(type="filepath", label="Processed Audio"),
        gr.Text(label="Top 5 Predictions"),
        gr.Text(label="Transcription"),
    ],
    title="Accent Classifier + Transcriber",
    description="Upload an audio or video file OR paste a direct video URL to classify the accent and transcribe the speech.",
)
|
159 |
|
160 |
if __name__ == "__main__":
|
local.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import gradio as gr
|
3 |
+
import torch
|
4 |
+
import shutil
|
5 |
+
import requests
|
6 |
+
import subprocess
|
7 |
+
import soundfile as sf
|
8 |
+
from scipy.signal import resample
|
9 |
+
from moviepy.editor import VideoFileClip, AudioFileClip
|
10 |
+
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor, pipeline
|
11 |
+
|
12 |
+
# === Constants ===
# Fixed working-file names used by the helpers and the request handler.
TEMP_VIDEO = "temp_video.mp4"            # default target of download_video
RAW_AUDIO = "raw_audio_input"            # copy of an uploaded audio file
CONVERTED_AUDIO = "converted_audio.wav"  # default output of convert_to_wav
MODEL_REPO = "ylacombe/accent-classifier"  # only referenced by the disabled load below

# === load local model
MODEL_DIR = "model"
model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_DIR, local_files_only=True)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_DIR)


# === Load models ===
# Alternative to the local load above; kept disabled:
# model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_REPO)
# feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_REPO)
whisper = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")

# Class names ordered by class index, so LABELS[i] names logit i.
LABELS = [
    model.config.id2label[class_idx]
    for class_idx in range(len(model.config.id2label))
]
model.eval()
|
31 |
+
|
32 |
+
# === Helpers ===
|
33 |
+
def convert_to_wav(input_path, output_path=CONVERTED_AUDIO):
    """Convert a media file to ``output_path`` by invoking ffmpeg.

    Args:
        input_path: Path passed to ffmpeg via ``-i``.
        output_path: Destination path; ``-y`` lets ffmpeg overwrite it.

    Returns:
        ``output_path``.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    command = ["ffmpeg", "-y", "-i", input_path, output_path]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Surface the failure here instead of letting a later read of the
    # missing output file produce a confusing error.
    if result.returncode != 0:
        stderr_lines = result.stderr.decode("utf-8", errors="replace").strip().splitlines()
        reason = stderr_lines[-1] if stderr_lines else "no error output"
        raise RuntimeError(f"FFmpeg failed to convert the input to WAV: {reason}")
    return output_path
|
37 |
+
|
38 |
+
def extract_audio_from_video(video_path, output_path="extracted_audio.wav"):
    """Write the audio track of a video file to ``output_path``.

    Args:
        video_path: Path of the video opened with ``VideoFileClip``.
        output_path: Destination handed to ``write_audiofile``.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If the clip reports no audio stream.
    """
    clip = VideoFileClip(video_path)
    try:
        if clip.audio is None:
            raise ValueError("No audio stream found in video.")
        clip.audio.write_audiofile(output_path)
    finally:
        # Always release what the clip holds, including on the error path.
        clip.close()
    return output_path
|
44 |
+
|
45 |
+
def download_video(url, filename=TEMP_VIDEO):
    """Download a video over HTTP and stream-copy it to ``filename`` with ffmpeg.

    The response body is streamed to a scratch file, which is then passed
    through ``ffmpeg -c copy -movflags +faststart``. The scratch file is
    removed on every exit path.

    Args:
        url: Address to fetch; the response must report a ``video/*``
            Content-Type.
        filename: Path of the ffmpeg output.

    Returns:
        ``filename``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the Content-Type is not ``video/*`` or ffmpeg does
            not produce a non-empty output file.
    """
    temp_download = "raw_download.mp4"
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        # The context manager releases the streamed connection on every path,
        # including the early failures below.
        with requests.get(url, headers=headers, stream=True, timeout=15) as r:
            r.raise_for_status()

            if not r.headers.get("Content-Type", "").startswith("video/"):
                raise RuntimeError(f"URL is not a video. Content-Type: {r.headers.get('Content-Type')}")

            with open(temp_download, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        ffmpeg_cmd = [
            "ffmpeg", "-y", "-i", temp_download,
            "-c", "copy", "-movflags", "+faststart", filename,
        ]
        result = subprocess.run(ffmpeg_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if result.returncode != 0 or not os.path.exists(filename) or os.path.getsize(filename) == 0:
            raise RuntimeError("FFmpeg failed to process the video.")
    finally:
        # Do not leave the raw download behind when ffmpeg (or anything
        # before it) fails.
        if os.path.exists(temp_download):
            os.remove(temp_download)

    return filename
|
70 |
+
|
71 |
+
def classify_accent(audio_path):
    """Classify the accent of an audio file with the module-level model.

    The audio is read with soundfile, averaged across channels when it has
    more than one dimension, resampled to 16 kHz when needed, and passed
    through ``feature_extractor`` and ``model``.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        A tuple ``(top_label, top_conf, top5_text)``: the most probable
        label, its probability rounded to 4 decimals, and a newline-joined
        ``"label: score"`` listing of the highest-scoring classes (at most 5).

    Raises:
        ValueError: If the file contains no audio samples.
    """
    target_sr = 16000

    waveform, sr = sf.read(audio_path)
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Fail with a clear message instead of an opaque error further down.
    if len(waveform) == 0:
        raise ValueError("Audio file contains no samples.")

    if sr != target_sr:
        num_samples = int(len(waveform) * target_sr / sr)
        waveform = resample(waveform, num_samples)
        sr = target_sr

    inputs = feature_extractor(waveform, sampling_rate=sr, return_tensors="pt", padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits[0]
        probs = torch.nn.functional.softmax(logits, dim=-1)

    top_idx = torch.argmax(probs).item()
    top_label = LABELS[top_idx]
    top_conf = round(probs[top_idx].item(), 4)

    # topk raises when k exceeds the number of classes, so clamp it.
    top5 = torch.topk(probs, k=min(5, probs.shape[-1]))
    top5_labels = [LABELS[i] for i in top5.indices.tolist()]
    top5_scores = [round(p, 4) for p in top5.values.tolist()]
    top5_text = "\n".join(f"{label}: {score}" for label, score in zip(top5_labels, top5_scores))

    return top_label, top_conf, top5_text
|
97 |
+
|
98 |
+
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level ``whisper`` pipeline.

    Args:
        audio_path: Path of the audio file handed to the pipeline.

    Returns:
        The ``"text"`` entry of the pipeline result with surrounding
        whitespace removed, or an empty string when that entry is absent.
    """
    output = whisper(audio_path, return_timestamps=True)
    text = output.get("text", "")
    return text.strip()
|
101 |
+
|
102 |
+
# === Main Handler ===
|
103 |
+
def process_input(audio_file, video_file, video_url):
    """Gradio handler: classify the accent and transcribe the first input given.

    Inputs are tried in order: uploaded audio, uploaded video, then a direct
    video URL. Whichever is used is converted to WAV before it is classified
    and transcribed.

    Args:
        audio_file: Path of an uploaded audio file, or a falsy value.
        video_file: Path of an uploaded video file, or a falsy value.
        video_url: Direct video URL, or an empty/blank string.

    Returns:
        A 6-tuple matching the interface outputs: prediction text, confidence,
        accent label, processed audio path, top-5 text, transcription. When no
        input is usable or an error occurs, the first item is a message and
        the remaining five are ``None``.
    """
    extracted_audio = "extracted_audio.wav"
    # Path handed back to the caller; it must survive the cleanup below.
    returned_audio = None
    try:
        audio_path = None

        if audio_file:
            shutil.copy(audio_file, RAW_AUDIO)
            audio_path = convert_to_wav(RAW_AUDIO)

        elif video_file:
            shutil.copy(video_file, TEMP_VIDEO)
            extracted = extract_audio_from_video(TEMP_VIDEO, output_path=extracted_audio)
            audio_path = convert_to_wav(extracted)

        elif video_url and video_url.strip():
            if "loom.com" in video_url:
                return "Loom links are not supported. Please upload the file or use a direct .mp4 URL.", None, None, None, None, None
            # Pasted URLs often carry stray whitespace; request the clean one.
            downloaded = download_video(video_url.strip())
            extracted = extract_audio_from_video(downloaded, output_path=extracted_audio)
            audio_path = convert_to_wav(extracted)

        else:
            return "Please provide an audio file, a video file, or a direct video URL.", None, None, None, None, None

        label, confidence, top5 = classify_accent(audio_path)
        transcription = transcribe_audio(audio_path)

        returned_audio = audio_path
        return f"Top prediction: {label}", confidence, label, audio_path, top5, transcription

    except Exception as e:
        return f"Error: {str(e)}", None, None, None, None, None

    finally:
        # This runs before the return value reaches the caller. The processed
        # audio path feeds the gr.Audio output, so that file is kept on
        # success; every other working file is removed.
        for f in [TEMP_VIDEO, RAW_AUDIO, CONVERTED_AUDIO, RAW_AUDIO + ".mp4", extracted_audio]:
            if f != returned_audio and os.path.exists(f):
                os.remove(f)
|
139 |
+
|
140 |
+
# === Gradio Interface ===
|
141 |
+
interface = gr.Interface(
    fn=process_input,
    # One input per source process_input accepts, in its parameter order.
    inputs=[
        gr.Audio(type="filepath", label="Upload MP3 or WAV"),
        gr.File(type="filepath", label="Upload MP4 Video"),
        gr.Textbox(label="Paste Direct .mp4 Video URL"),
    ],
    # Same order as the 6-tuple returned by process_input.
    outputs=[
        gr.Text(label="Prediction"),
        gr.Number(label="Confidence Score"),
        gr.Text(label="Accent"),
        gr.Audio(type="filepath", label="Processed Audio"),
        gr.Text(label="Top 5 Predictions"),
        gr.Text(label="Transcription"),
    ],
    title="Accent Classifier + Transcriber",
    description="Upload an audio or video file OR paste a direct video URL to classify the accent and transcribe the speech.",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()
|