Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -47,6 +47,22 @@ CACHE_EXAMPLES = os.getenv('CACHE_EXAMPLES', '1') == '1'
|
|
| 47 |
|
| 48 |
base_dir = "/tmp/gradio/"
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
def analyze(path):
|
| 51 |
#Measure time for inference
|
| 52 |
start = time.time()
|
|
@@ -64,6 +80,8 @@ def analyze(path):
|
|
| 64 |
for file_path in files:
|
| 65 |
json_structure_output = os.path.join(root, file_path)
|
| 66 |
print(json_structure_output)
|
|
|
|
|
|
|
| 67 |
|
| 68 |
fig = allin1.visualize(
|
| 69 |
result,
|
|
@@ -107,9 +125,14 @@ def analyze(path):
|
|
| 107 |
|
| 108 |
def add_voice_label(json_file, audio_path):
|
| 109 |
# Load the JSON file
|
| 110 |
-
|
| 111 |
-
with open(file_path, 'r') as f:
|
| 112 |
data = json.load(f)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
# Access the segments
|
| 115 |
segments = data['segments']
|
|
@@ -118,18 +141,30 @@ def add_voice_label(json_file, audio_path):
|
|
| 118 |
for segment in segments:
|
| 119 |
start = segment['start']
|
| 120 |
end = segment['end']
|
| 121 |
-
|
| 122 |
-
audio_segment = get_audio_segment()
|
| 123 |
|
|
|
|
|
|
|
| 124 |
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
|
| 134 |
|
| 135 |
|
|
|
|
| 47 |
|
| 48 |
base_dir = "/tmp/gradio/"
|
| 49 |
|
| 50 |
+
# Sampling rate used for voice activity detection.
# NOTE(review): Silero VAD documents 8000 Hz and 16000 Hz (or a multiple of
# 16000 Hz, which it decimates internally) as the accepted rates -- confirm
# against the model version that torch.hub resolves.
SAMPLING_RATE = 32000
torch.set_num_threads(1)

# Load the Silero VAD model together with its helper utilities via torch.hub
model, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-vad',
    model='silero_vad',
    force_reload=True,
    onnx=USE_ONNX,
)

# The helpers are returned as one tuple; bind each to a module-level name
get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks = utils
|
| 65 |
+
|
| 66 |
def analyze(path):
|
| 67 |
#Measure time for inference
|
| 68 |
start = time.time()
|
|
|
|
| 80 |
for file_path in files:
|
| 81 |
json_structure_output = os.path.join(root, file_path)
|
| 82 |
print(json_structure_output)
|
| 83 |
+
|
| 84 |
+
add_voice_label(json_structure_output, path)
|
| 85 |
|
| 86 |
fig = allin1.visualize(
|
| 87 |
result,
|
|
|
|
| 125 |
|
| 126 |
def add_voice_label(json_file, audio_path):
    """Tag every segment of a structure JSON file with a "voice" Yes/No label.

    The JSON file at ``json_file`` is loaded, each entry of its ``segments``
    list receives a ``voice`` key ("Yes" or "No"), and the file is rewritten
    in place with 4-space indentation.

    For each segment, the audio between its ``start`` and ``end`` values
    (treated as seconds and converted to sample offsets with
    ``SAMPLING_RATE``) is passed to the voice-activity model in fixed-size
    windows. The segment is labelled "Yes" when the mean speech probability
    over those windows is at least 0.7, otherwise "No". A segment too short
    to hold one full window is labelled "No".

    Args:
        json_file: Path of the JSON file containing a top-level ``segments``
            list whose items have ``start`` and ``end`` keys.
        audio_path: Path of the audio file the segments refer to.
    """
    # Load the JSON file
    with open(json_file, 'r') as f:
        data = json.load(f)

    # Create VAD object; used below to reset the model state between segments
    vad_iterator = VADIterator(model)

    # Read input audio file
    wav = read_audio(audio_path, sampling_rate=SAMPLING_RATE)
    total_samples = len(wav)

    # Access the segments
    segments = data['segments']

    # NOTE(review): the diff this block was recovered from elides two
    # unchanged lines at this point -- confirm nothing functional is missing.

    window_size_samples = 1536
    voice_threshold = 0.7

    for segment in segments:
        # Clamp the segment to the audio that actually exists
        start_sample = max(0, int(segment['start'] * SAMPLING_RATE))
        end_sample = min(total_samples, int(segment['end'] * SAMPLING_RATE))

        # Score only this segment's samples. Scanning the whole file here
        # would give every segment the same mean and therefore the same label.
        # The range stops before a trailing partial window.
        speech_probs = []
        last_window_start = end_sample - window_size_samples
        for i in range(start_sample, last_window_start + 1, window_size_samples):
            chunk = wav[i:i + window_size_samples]
            speech_probs.append(model(chunk, SAMPLING_RATE).item())
        vad_iterator.reset_states()  # reset model states after each segment

        # np.mean([]) would yield NaN plus a RuntimeWarning; treat "no full
        # window" as no detected voice instead.
        mean_probability = float(np.mean(speech_probs)) if speech_probs else 0.0
        print(mean_probability)

        segment['voice'] = "Yes" if mean_probability >= voice_threshold else "No"

    with open(json_file, 'w') as f:
        json.dump(data, f, indent=4)
|
| 168 |
|
| 169 |
|
| 170 |
|