anzorq committed
Commit 8ca2e83 · verified · 1 Parent(s): f366a55

Create app.py

Files changed (1)
  1. app.py +47 -0
app.py ADDED
@@ -0,0 +1,47 @@
+ import gradio as gr
+ import torch
+ import torchaudio
+ from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
+
+ model = AutoModelForCTC.from_pretrained("anzorq/output").to("cuda" if torch.cuda.is_available() else "cpu")
+ processor = Wav2Vec2BertProcessor.from_pretrained("anzorq/output")
+
+ def transcribe_speech(audio):
+     # Load the audio file
+     waveform, sr = torchaudio.load(audio)
+
+     # Resample to the model's expected 16 kHz only if needed
+     if sr != 16000:
+         waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(waveform)
+
+     # Convert multi-channel audio to mono
+     if waveform.dim() > 1:
+         waveform = torch.mean(waveform, dim=0)
+
+     # Normalize the audio to [-1, 1]
+     waveform = waveform / torch.max(torch.abs(waveform))
+
+     # Extract input features on CPU, then move them to the model's device
+     input_features = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt").input_features
+     input_features = input_features.to(model.device)
+
+     # Generate logits using the model
+     with torch.no_grad():
+         logits = model(input_features).logits
+
+     # Greedy CTC decoding: most likely token per frame, collapsed to text
+     pred_ids = torch.argmax(logits, dim=-1)[0]
+     pred_text = processor.decode(pred_ids, skip_special_tokens=True)
+
+     return pred_text
+
+ # Define the Gradio interface
+ interface = gr.Interface(
+     fn=transcribe_speech,
+     inputs=gr.Audio(source="microphone", type="filepath"),
+     outputs="text",
+     live=True,
+ )
+
+ # Launch the app
+ interface.launch()
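
Two notes on the code as committed: gr.Audio(source=...) is the Gradio 3.x signature; on Gradio 4+ the equivalent parameter is sources=["microphone"]. And for a quick smoke test outside the web UI, the handler can be called directly on an audio file path. A minimal sketch, assuming a hypothetical local recording sample.wav (any format torchaudio can load should work, since the function resamples and downmixes itself):

if __name__ == "__main__":
    # "sample.wav" is a hypothetical placeholder; point this at a real recording
    print(transcribe_speech("sample.wav"))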