Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from TTS.api import TTS
|
2 |
+
from speechbrain.pretrained import SpeakerRecognition
|
3 |
+
from transformers import pipeline
|
4 |
+
import gradio as gr
|
5 |
+
import numpy as np
|
6 |
+
import soundfile as sf
|
7 |
+
from scipy.signal import resample
|
8 |
+
from scipy.io.wavfile import write as write_wav
|
9 |
+
from tempfile import NamedTemporaryFile
|
10 |
+
import os
|
11 |
+
|
12 |
+
# --- Model setup (runs once at import time) ---------------------------------

# Identifiers of the pretrained checkpoints used by this app.
XTTS_MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
SPEAKER_VERIFICATION_SOURCE = "speechbrain/spkrec-ecapa-voxceleb"
AST_SPOOF_MODEL = "MattyB95/AST-VoxCelebSpoof-Synthetic-Voice-Detection"
DEEPFAKE_MODEL = "MelodyMachine/Deepfake-audio-detection-V2"

# Voice cloning model (XTTS), loaded on CPU.
# NOTE(review): the XTTS download may ask for interactive licence
# confirmation on first use -- confirm how this behaves in a headless deploy.
tts = TTS(
    model_name=XTTS_MODEL_NAME,
    progress_bar=False,
    gpu=False,
)

# Spoof detection models: one speaker-verification model and two
# audio-classification pipelines.
sb = SpeakerRecognition.from_hparams(
    source=SPEAKER_VERIFICATION_SOURCE,
    savedir="tmp_model",
)
ast_pipe = pipeline("audio-classification", model=AST_SPOOF_MODEL)
df_pipe = pipeline("audio-classification", model=DEEPFAKE_MODEL)
|
19 |
+
|
20 |
+
def _to_unit_float32(samples):
    """Return *samples* as float32, scaling integer PCM into [-1.0, 1.0]."""
    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM (e.g. int16) must be scaled by its full-scale value;
        # a bare cast to float32 would leave amplitudes in the thousands,
        # which is far outside the range a float WAV is expected to hold.
        full_scale = float(np.iinfo(samples.dtype).max)
        scaled = samples.astype(np.float32) / full_scale
        # The most negative integer maps slightly below -1.0; clamp it.
        return np.clip(scaled, -1.0, 1.0)
    return samples.astype(np.float32)


def spoof_and_detect(voice_sample, desired_sr=16000):
    """Clone the recorded voice and run the spoof detectors on both clips.

    Args:
        voice_sample: Audio from the Gradio ``Audio`` input configured with
            ``type="numpy"``, i.e. a ``(sample_rate, samples)`` tuple. The
            reversed ``(samples, sample_rate)`` order is accepted as well.
        desired_sr: Sample rate in Hz that the reference recording is
            resampled to before it is written to disk.

    Returns:
        A tuple ``(reference_wav_path, cloned_wav_path, results)`` where
        ``results`` maps each detector name to a display string.

    Raises:
        gr.Error: If no audio was provided or the recording is empty.
    """
    if voice_sample is None:
        raise gr.Error("No audio received. Please record your voice first.")

    first, second = voice_sample
    if isinstance(first, np.ndarray):
        # Tolerate callers that pass (samples, sample_rate).
        ref_audio_array, ref_sr = first, second
    else:
        # Gradio's numpy audio format puts the sample rate first.
        ref_sr, ref_audio_array = first, second

    ref_audio_array = _to_unit_float32(ref_audio_array)
    if ref_audio_array.shape[0] == 0:
        raise gr.Error("The recording is empty. Please record your voice again.")

    # Resample to the requested rate (16 kHz by default).
    if ref_sr != desired_sr:
        # Round rather than truncate so float error cannot drop a sample.
        num_samples = int(round(ref_audio_array.shape[0] * desired_sr / ref_sr))
        ref_audio_array = resample(ref_audio_array, max(num_samples, 1))
        ref_sr = desired_sr

    # Reserve a temp file name, then close the handle before writing so the
    # path can be reopened on every platform. The file is deliberately kept:
    # its path is returned to the UI for playback.
    with NamedTemporaryFile(suffix=".wav", mode="wb", delete=False) as ref_wav:
        ref_temp_path = ref_wav.name
    write_wav(ref_temp_path, ref_sr, ref_audio_array.astype("float32"))

    # Clone voice. Only the extension is rewritten, never another ".wav"
    # that might occur earlier in the path.
    path_root, path_ext = os.path.splitext(ref_temp_path)
    clone_path = f"{path_root}_clone{path_ext}"
    tts.tts_to_file(
        text="My voice is my password.",
        speaker_wav=ref_temp_path,
        file_path=clone_path,
        language="en",
    )

    # Spoof detection: speaker verification between the two clips, plus the
    # top prediction of each classifier for the reference and the clone.
    _sb_score, sb_label = sb.verify_files(ref_temp_path, clone_path)
    ast_ref = ast_pipe(ref_temp_path)[0]
    ast_clone = ast_pipe(clone_path)[0]
    df_ref = df_pipe(ref_temp_path)[0]
    df_clone = df_pipe(clone_path)[0]

    results = {
        "SpeechBrain": str(sb_label.item()),
        "AST REF": f"{ast_ref['label']} ({ast_ref['score']:.2f})",
        "AST CLONE": f"{ast_clone['label']} ({ast_clone['score']:.2f})",
        "Deepfake REF": f"{df_ref['label']} ({df_ref['score']:.2f})",
        "Deepfake CLONE": f"{df_clone['label']} ({df_clone['score']:.2f})",
    }

    return ref_temp_path, clone_path, results
|
60 |
+
|
61 |
+
# Build the microphone input. Older Gradio releases take ``source="..."``;
# newer ones reject that keyword with a TypeError and expect
# ``sources=[...]`` instead. The original spelling is tried first so
# behaviour on older releases is unchanged.
_mic_options = {"type": "numpy", "label": "🎤 Record your voice"}
try:
    _mic_input = gr.Audio(source="microphone", **_mic_options)
except TypeError:
    _mic_input = gr.Audio(sources=["microphone"], **_mic_options)

# Web UI: one recording in; the reference clip, the cloned clip and the
# detector verdicts out.
demo = gr.Interface(
    fn=spoof_and_detect,
    inputs=_mic_input,
    outputs=[
        gr.Audio(label="🎧 Original"),
        gr.Audio(label="🎧 Cloned"),
        gr.JSON(label="🧪 Spoof Detection Results"),
    ],
    title="🗣️ Voice Cloning + Spoof Detection",
    description="Clone a speaker's voice and evaluate with 3 spoof detection models.",
)

demo.launch()
|