Spaces:

anzorq
/

w2v-bert-2.0-kbd

Paused

anzorq committed on May 16, 2024

Commit

8ca2e83

verified ·

1 Parent(s): f366a55

Create app.py

Files changed (1) hide show

app.py ADDED Viewed

+import gradio as gr
+import torch
+import torchaudio
+from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
+model = AutoModelForCTC.from_pretrained("anzorq/output")
+processor = Wav2Vec2BertProcessor.from_pretrained("anzorq/output")
+def transcribe_speech(audio):
+    # Load the audio file
+    waveform, sr = torchaudio.load(audio)
+    # Resample the audio if needed
+    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
+    waveform = resampler(waveform)
+    # Convert to mono if needed
+    if waveform.dim() > 1:
+        waveform = torch.mean(waveform, dim=0)
+    # Normalize the audio
+    waveform = waveform / torch.max(torch.abs(waveform))
+    # Extract input features
+    input_features = processor(waveform.unsqueeze(0), sampling_rate=16000).input_features
+    input_features = torch.from_numpy(input_features).to("cuda" if torch.cuda.is_available() else "cpu")
+    # Generate logits using the model
+    with torch.no_grad():
+        logits = model(input_features).logits
+    # Decode the predicted ids to text
+    pred_ids = torch.argmax(logits, dim=-1)[0]
+    pred_text = processor.decode(pred_ids)
+    return pred_text
+# Define the Gradio interface
+interface = gr.Interface(
+    fn=transcribe_speech,
+    inputs=gr.Audio(source="microphone", type="filepath"),
+    outputs="text",
+    live=True,
+)
+# Launch the app
+interface.launch()