Bisher committed on
Commit 0a08e17 · verified · 1 Parent(s): b934afb
Files changed (1)
  1. app.py +34 -28
app.py CHANGED
@@ -1,29 +1,35 @@
- from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
- import torch
- import gradio as gr
- import librosa
-
- # load model and processor
- processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
- model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
-
- # define prediction function
- def audio2phoneme(audio_path):
-     audio, sr = librosa.load(audio_path, sr=16000)
-     input_values = processor(audio, return_tensors="pt", padding=True).input_values
-     with torch.no_grad():
-         logits = model(input_values).logits
-     predicted_ids = torch.argmax(logits, dim=-1)
-     transcription = processor.batch_decode(predicted_ids)
-     return ' '.join(transcription)
-
- app = gr.Interface(
-     fn=audio2phoneme,
-     inputs=gr.Audio(sources=["upload","microphone"], type="filepath"),
-     outputs=gr.Textbox(label="Phoneme Transcription", show_copy_button=True, show_label=True),
-     description="Get phonemes from audio",
-     title="Audio to Phoneme Transcription using facebook/wav2vec2-lv-60-espeak-cv",
- )
-
- # start space
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+ import torch
+ import gradio as gr
+ import librosa
+ import os
+ import subprocess
+
+ # Install system dependencies
+ subprocess.run(["apt-get", "update"], check=True)
+ subprocess.run(["apt-get", "install", "-y", "espeak"], check=True)
+
+ # load model and processor
+ processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
+ model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
+
+ # define prediction function
+ def audio2phoneme(audio_path):
+     audio, sr = librosa.load(audio_path, sr=16000)
+     input_values = processor(audio, return_tensors="pt", padding=True).input_values
+     with torch.no_grad():
+         logits = model(input_values).logits
+     predicted_ids = torch.argmax(logits, dim=-1)
+     transcription = processor.batch_decode(predicted_ids)
+     return ' '.join(transcription)
+
+ app = gr.Interface(
+     fn=audio2phoneme,
+     inputs=gr.Audio(sources=["upload","microphone"], type="filepath"),
+     outputs=gr.Textbox(label="Phoneme Transcription", show_copy_button=True, show_label=True),
+     description="Get phonemes from audio",
+     title="Audio to Phoneme Transcription using facebook/wav2vec2-lv-60-espeak-cv",
+ )
+
+ # start space
  app.launch(share=True)
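
For reference, a minimal sketch of the same phoneme-transcription flow outside the Gradio wrapper, assuming the same checkpoint and a local 16 kHz-compatible audio file; the path `sample.wav` is hypothetical and not part of this commit:

```python
# Standalone sketch of the transcription step in app.py (no Gradio).
# "sample.wav" is a placeholder path; espeak and the model weights must be available.
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")

audio, _ = librosa.load("sample.wav", sr=16000)                      # resample to 16 kHz mono
inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
with torch.no_grad():
    logits = model(inputs.input_values).logits                       # (batch, time, vocab)
predicted_ids = torch.argmax(logits, dim=-1)                         # greedy CTC decoding
print(processor.batch_decode(predicted_ids))                         # list with one phoneme string
```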