anzorq committed on
Commit
0863f8c
·
verified ·
1 Parent(s): b4959b1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -64
app.py CHANGED
@@ -5,75 +5,16 @@ import torch
5
  import torchaudio
6
  from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
7
  from pytube import YouTube
 
8
 
9
- model = AutoModelForCTC.from_pretrained("anzorq/w2v-bert-2.0-kbd")
10
- processor = Wav2Vec2BertProcessor.from_pretrained("anzorq/w2v-bert-2.0-kbd")
11
-
12
- device = "cuda" if torch.cuda.is_available() else "cpu"
13
- model.to(device)
14
-
15
- # Chunk processing parameters
16
- chunk_length_s = 10 # Chunk length in seconds
17
- stride_length_s = (4, 2) # Stride lengths in seconds
18
 
19
  @spaces.GPU
20
  def transcribe_speech(audio):
21
  if audio is None: # Handle the NoneType error for microphone input
22
  return "No audio received."
23
 
24
- waveform, sr = torchaudio.load(audio)
25
-
26
- # Resample the audio if needed
27
- if sr != 16000:
28
- resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
29
- waveform = resampler(waveform)
30
-
31
- # Convert to mono if needed
32
- if waveform.dim() > 1:
33
- waveform = torch.mean(waveform, dim=0)
34
-
35
- # Ensure the waveform is a 2D tensor for chunking
36
- waveform = waveform.unsqueeze(0) # Add a dimension if it's mono
37
-
38
- # Normalize the audio
39
- waveform = waveform / torch.max(torch.abs(waveform))
40
-
41
- # Chunk the audio
42
- chunks = torch.split(waveform, int(chunk_length_s * sr), dim=1)
43
-
44
- # Process each chunk with striding
45
- full_transcription = ""
46
- for i, chunk in enumerate(chunks):
47
- with torch.no_grad():
48
- # Calculate stride lengths in frames
49
- left_stride_frames = int(stride_length_s[0] * sr)
50
- right_stride_frames = int(stride_length_s[1] * sr)
51
-
52
- # Extract the effective chunk with stride
53
- start_frame = max(0, left_stride_frames * (i - 1))
54
- end_frame = min(chunk.size(1), chunk.size(1) - right_stride_frames * i)
55
-
56
- # Check for negative duration before processing
57
- if end_frame <= start_frame:
58
- continue # Skip this chunk
59
-
60
- effective_chunk = chunk[:, start_frame:end_frame]
61
-
62
- # Extract input features
63
- input_features = processor(effective_chunk, sampling_rate=16000).input_features
64
- input_features = torch.from_numpy(input_features).to(device)
65
-
66
- # Generate logits using the model
67
- logits = model(input_features).logits
68
-
69
- # Decode the predicted ids to text
70
- pred_ids = torch.argmax(logits, dim=-1)[0]
71
- pred_text = processor.decode(pred_ids)
72
-
73
- # Append the chunk's transcription to the full transcription
74
- full_transcription += pred_text
75
-
76
- return full_transcription
77
 
78
  def transcribe_from_youtube(url):
79
  # Download audio from YouTube using pytube
@@ -118,11 +59,11 @@ with gr.Blocks() as demo:
118
  gr.Markdown("## Transcribe speech from YouTube video")
119
  youtube_url = gr.Textbox(label="Enter YouTube video URL")
120
  title = gr.Label(label="Video Title")
121
- img = gr.Image(label="Thumbnail")
122
  transcribe_button = gr.Button("Transcribe")
123
  transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
124
 
125
  transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=transcription_output)
126
  youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])
127
 
128
- demo.launch(debug=True)
 
5
  import torchaudio
6
  from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
7
  from pytube import YouTube
8
+ from transformers import pipeline
9
 
10
+ pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd", device=0)
 
 
 
 
 
 
 
 
11
 
12
  @spaces.GPU
13
  def transcribe_speech(audio):
14
  if audio is None: # Handle the NoneType error for microphone input
15
  return "No audio received."
16
 
17
+ return pipe(audio, chunk_length_s=10)['text']#, return_timestamps='word')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  def transcribe_from_youtube(url):
20
  # Download audio from YouTube using pytube
 
59
  gr.Markdown("## Transcribe speech from YouTube video")
60
  youtube_url = gr.Textbox(label="Enter YouTube video URL")
61
  title = gr.Label(label="Video Title")
62
+ img = gr.Image(label="Thumbnail", height=120, width=120)
63
  transcribe_button = gr.Button("Transcribe")
64
  transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
65
 
66
  transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=transcription_output)
67
  youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])
68
 
69
+ demo.launch()