herme committed on
Commit bdaccf4 · 1 parent: 150d5bd

Update app.py

Files changed (1)
  1. app.py +80 -107
app.py CHANGED
@@ -1,109 +1,82 @@
- import whisper
- import gradio as gr
- import datetime
-
- import subprocess
-
- import torch
- import pyannote.audio
- from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
-
- from pyannote.audio import Audio
- from pyannote.core import Segment
-
- import wave
- import contextlib
-
- from sklearn.cluster import AgglomerativeClustering
- import numpy as np
-
- model = whisper.load_model("large-v2")
- embedding_model = PretrainedSpeakerEmbedding(
-     "speechbrain/spkrec-ecapa-voxceleb",
-     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- )
-
- def transcribe(audio, num_speakers):
-   path, error = convert_to_wav(audio)
-   if error is not None:
-     return error
-
-   duration = get_duration(path)
-   if duration > 4 * 60 * 60:
-     return "Audio duration too long"
-
-   result = model.transcribe(path)
-   segments = result["segments"]
-
-   num_speakers = min(max(round(num_speakers), 1), len(segments))
-   if len(segments) == 1:
-     segments[0]['speaker'] = 'SPEAKER 1'
-   else:
-     embeddings = make_embeddings(path, segments, duration)
-     add_speaker_labels(segments, embeddings, num_speakers)
-   output = get_output(segments)
-   return output

- def convert_to_wav(path):
-   if path[-3:] != 'wav':
-     new_path = '.'.join(path.split('.')[:-1]) + '.wav'
-     try:
-       subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
-     except:
-       return path, 'Error: Could not convert file to .wav'
-     path = new_path
-   return path, None
-
- def get_duration(path):
-   with contextlib.closing(wave.open(path,'r')) as f:
-     frames = f.getnframes()
-     rate = f.getframerate()
-     return frames / float(rate)
-
- def make_embeddings(path, segments, duration):
-   embeddings = np.zeros(shape=(len(segments), 192))
-   for i, segment in enumerate(segments):
-     embeddings[i] = segment_embedding(path, segment, duration)
-   return np.nan_to_num(embeddings)
-
- audio = Audio()
-
- def segment_embedding(path, segment, duration):
-   start = segment["start"]
-   # Whisper overshoots the end timestamp in the last segment
-   end = min(duration, segment["end"])
-   clip = Segment(start, end)
-   waveform, sample_rate = audio.crop(path, clip)
-   return embedding_model(waveform[None])
-
- def add_speaker_labels(segments, embeddings, num_speakers):
-   clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-   labels = clustering.labels_
-   for i in range(len(segments)):
-     segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
-
- def time(secs):
-   return datetime.timedelta(seconds=round(secs))
-
- def get_output(segments):
-   output = ''
-   for (i, segment) in enumerate(segments):
-     if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-       if i != 0:
-         output += '\n\n'
-       output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
-     output += segment["text"][1:] + ' '
-   return output
-
- gr.Interface(
-   title = 'Prueba Whisper Audio to Text ',
-   fn=transcribe,
-   inputs=[
-     gr.inputs.Audio(source="upload", type="filepath"),
-     gr.inputs.Number(default=2, label="Number of Speakers")
-
-   ],
-   outputs=[
-     gr.outputs.Textbox(label='Transcript')
  ]
- ).launch()
+ from typing import Dict

+ import gradio as gr
+ import whisper
+ from whisper.tokenizer import get_tokenizer
+
+ import classify
+
+ model_cache = {}
+
+
+ def zero_shot_classify(audio_path: str, class_names: str, model_name: str) -> Dict[str, float]:
+     class_names = class_names.split(",")
+     tokenizer = get_tokenizer(multilingual=".en" not in model_name)
+
+     if model_name not in model_cache:
+         model = whisper.load_model(model_name)
+         model_cache[model_name] = model
+     else:
+         model = model_cache[model_name]
+
+     internal_lm_average_logprobs = classify.calculate_internal_lm_average_logprobs(
+         model=model,
+         class_names=class_names,
+         tokenizer=tokenizer,
+     )
+     audio_features = classify.calculate_audio_features(audio_path, model)
+     average_logprobs = classify.calculate_average_logprobs(
+         model=model,
+         audio_features=audio_features,
+         class_names=class_names,
+         tokenizer=tokenizer,
+     )
+     average_logprobs -= internal_lm_average_logprobs
+     scores = average_logprobs.softmax(-1).tolist()
+     return {class_name: score for class_name, score in zip(class_names, scores)}
+
+
+ def main():
+     CLASS_NAMES = "[dog barking],[helicopter whirring],[laughing],[birds chirping],[clock ticking]"
+     AUDIO_PATHS = [
+         "./data/(dog)1-100032-A-0.wav",
+         "./data/(helicopter)1-181071-A-40.wav",
+         "./data/(laughing)1-1791-A-26.wav",
+         "./data/(chirping_birds)1-34495-A-14.wav",
+         "./data/(clock_tick)1-21934-A-38.wav",
    ]
+     EXAMPLES = []
+     for audio_path in AUDIO_PATHS:
+         EXAMPLES.append([audio_path, CLASS_NAMES, "small"])
+
+     DESCRIPTION = (
+         '<div style="text-align: center;">'
+         "<p>This demo allows you to try out zero-shot audio classification using "
+         "<a href=https://github.com/openai/whisper>Whisper</a>.</p>"
+         "<p>Github: <a href=https://github.com/jumon/zac>https://github.com/jumon/zac</a></p>"
+         "<p>Example audio files are from the <a href=https://github.com/karolpiczak/ESC-50>ESC-50"
+         "</a> dataset (CC BY-NC 3.0).</p></div>"
+     )
+
+     demo = gr.Interface(
+         fn=zero_shot_classify,
+         inputs=[
+             gr.Audio(source="upload", type="filepath", label="Audio File"),
+             gr.Textbox(lines=1, label="Candidate class names (comma-separated)"),
+             gr.Radio(
+                 choices=["tiny", "base", "small", "medium", "large"],
+                 value="small",
+                 label="Model Name",
+             ),
+         ],
+         outputs="label",
+         examples=EXAMPLES,
+         title="Zero-shot Audio Classification using Whisper",
+         description=DESCRIPTION,
+     )
+
+     demo.launch()
+
+
+ if __name__ == "__main__":
+     main()
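
The new app.py delegates all scoring to a local classify module whose implementation is not part of this commit; only its call sites appear in the diff above. The snippet below is a minimal sketch of the kind of scoring those calls imply: it averages the Whisper decoder's log-probabilities over the tokens of one candidate class name, conditioned on the encoded audio. The helper average_logprob, the prompt construction, and the file example.wav are illustrative assumptions rather than the repo's actual classify.py.

# Illustrative sketch only (assumed helper, not the actual classify.py).
import torch
import whisper
from whisper.tokenizer import get_tokenizer


def average_logprob(model, tokenizer, audio_features, class_name):
    # Prompt: Whisper's start-of-transcript sequence (without timestamps),
    # followed by the candidate text; only the candidate tokens are scored.
    prompt = list(tokenizer.sot_sequence_including_notimestamps)
    text_tokens = tokenizer.encode(" " + class_name.strip())
    tokens = torch.tensor([prompt + text_tokens], device=model.device)
    with torch.no_grad():
        logits = model.logits(tokens, audio_features)  # (1, seq_len, vocab)
    logprobs = torch.log_softmax(logits.float(), dim=-1)
    # The distribution at position i predicts the token at position i + 1.
    total = 0.0
    for i, token in enumerate(text_tokens):
        total += logprobs[0, len(prompt) + i - 1, token].item()
    return total / len(text_tokens)


if __name__ == "__main__":
    model = whisper.load_model("tiny")
    tokenizer = get_tokenizer(multilingual=model.is_multilingual)
    # Standard Whisper preprocessing: 30 s log-Mel spectrogram -> encoder features.
    audio = whisper.pad_or_trim(whisper.load_audio("example.wav"))  # hypothetical file
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    audio_features = model.embed_audio(mel.unsqueeze(0))
    for name in ["[dog barking]", "[clock ticking]"]:
        print(name, average_logprob(model, tokenizer, audio_features, name))

In app.py, subtracting internal_lm_average_logprobs before the softmax appears intended to cancel the decoder's text-only prior over the class names, so the final scores reflect audio-conditioned evidence rather than how probable each label string is on its own.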