EladSpamson committed on
Commit
67f033c
·
verified ·
1 Parent(s): e611aba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -10
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import gradio as gr
2
  import torch
3
  import librosa
 
4
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
5
 
6
  # Load the Whisper model
@@ -12,24 +13,57 @@ model = WhisperForConditionalGeneration.from_pretrained(model_id)
12
  device = "cuda" if torch.cuda.is_available() else "cpu"
13
  model.to(device)
14
 
15
- # Function to transcribe Hebrew audio
16
  def transcribe(audio):
17
- waveform, sr = librosa.load(audio, sr=16000) # Convert to 16kHz
18
- input_features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features.to(device)
19
 
20
- with torch.no_grad():
21
- predicted_ids = model.generate(input_features)
 
 
22
 
23
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
24
- return transcription
 
25
 
26
- # FIXED: Removed 'source="upload"' from gr.Audio()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  iface = gr.Interface(
28
  fn=transcribe,
29
- inputs=gr.Audio(type="filepath"), # Corrected line
30
  outputs="text",
31
  title="Hebrew Speech-to-Text (Whisper)",
32
- description="Upload a Hebrew audio file and receive a transcription.",
33
  )
34
 
 
35
  iface.launch()
 
1
  import gradio as gr
2
  import torch
3
  import librosa
4
+ import numpy as np
5
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
6
 
7
  # Load the Whisper model
 
13
  device = "cuda" if torch.cuda.is_available() else "cpu"
14
  model.to(device)
15
 
16
# Transcribe long audio by feeding Whisper one 30-second window at a time.
def transcribe(audio):
    """Transcribe an audio file of up to 60 minutes into a single string.

    Whisper's feature extractor pads or truncates every input to one
    30-second window, so the waveform is cut into 30-second chunks and each
    chunk is decoded separately. Larger chunks would silently lose everything
    after their first 30 seconds.

    Args:
        audio: Path to the audio file, or ``None`` when nothing was uploaded.

    Returns:
        The chunk transcriptions joined by single spaces. An empty string is
        returned when no file was given or no chunk was long enough to decode.
    """
    # Gradio passes None when the user submits without uploading a file.
    if audio is None:
        return ""

    sample_rate = 16000  # sampling rate passed to the Whisper processor
    window_seconds = 30  # one Whisper input window
    max_audio_seconds = 60 * 60  # hard cap: 60 minutes
    min_chunk_seconds = 2  # shorter tail fragments are skipped

    # `duration` stops decoding at the cap instead of loading the whole file
    # into memory and truncating afterwards.
    waveform, _ = librosa.load(audio, sr=sample_rate, duration=max_audio_seconds)

    window_samples = sample_rate * window_seconds
    min_samples = sample_rate * min_chunk_seconds

    transcriptions = []
    for start in range(0, len(waveform), window_samples):
        chunk = waveform[start : start + window_samples]
        if len(chunk) < min_samples:
            continue

        input_features = processor(
            chunk, sampling_rate=sample_rate, return_tensors="pt"
        ).input_features.to(device)

        # Default (greedy) decoding keeps the output deterministic. The
        # length limit comes from the model's own generation config;
        # requesting 500 new tokens would exceed Whisper's 448-token decoder
        # limit.
        with torch.no_grad():
            predicted_ids = model.generate(input_features)

        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        text = text.strip()
        if text:
            transcriptions.append(text)

    return " ".join(transcriptions)
58
+
59
# Build the web UI: one audio upload in, plain transcription text out.
_audio_input = gr.Audio(type="filepath")  # transcribe() receives a file path
iface = gr.Interface(
    fn=transcribe,
    inputs=_audio_input,
    outputs="text",
    title="Hebrew Speech-to-Text (Whisper)",
    description="Upload a Hebrew audio file (up to 60 minutes) for full transcription.",
)

# Start serving the app.
iface.launch()