EladSpamson committed (verified)
Commit 221d07a · Parent: 3d55353

Update app.py

Files changed (1): app.py (+56, -64)
app.py CHANGED
@@ -1,104 +1,96 @@
  import gradio as gr
  import torch
  import librosa
- import numpy as np
- from faster_whisper import WhisperModel
-
- # -----------------------------
- # 1. Load Faster Whisper Model
- # -----------------------------
- # * Use device="cuda" if a GPU is available, otherwise "cpu".
- model = WhisperModel(
-     "ivrit-ai/whisper-large-v3-turbo-ct2",
-     device="cuda" if torch.cuda.is_available() else "cpu"
- )

  # --------------------------------
- # 2. Global Stop Flag for Gradio
  # --------------------------------
- stop_processing = False

  def stop():
-     """Set a global stop flag, letting the user interrupt transcription."""
      global stop_processing
      stop_processing = True

- # --------------------------------------------
- # 3. Transcription Function (with Chunking)
- # --------------------------------------------
  def transcribe(audio_file):
-     """
-     Transcribe Hebrew speech from an uploaded audio file using Faster Whisper.
-     Splits audio into ~2-minute chunks to handle very large files (up to 60 min).
-     """
      global stop_processing
-     stop_processing = False  # Reset at the start of a new transcription

-     # A) Load Audio (Librosa) -> 16kHz
-     sample_rate = 16000
-     waveform, sr = librosa.load(audio_file, sr=sample_rate)

-     # Trim audio if it exceeds 60 minutes
-     max_audio_length = 60 * 60  # 60 minutes in seconds
-     if len(waveform) > sr * max_audio_length:
-         waveform = waveform[: sr * max_audio_length]

-     # B) Split into ~2-min chunks
-     chunk_duration = 2 * 60  # 2 minutes = 120 seconds
      chunks = []
      for start_idx in range(0, len(waveform), sr * chunk_duration):
          if stop_processing:
              return "⚠️ Transcription Stopped by User ⚠️"

          chunk = waveform[start_idx : start_idx + sr * chunk_duration]
-         # Skip very short chunks (<2s) if you want
          if len(chunk) < sr * 2:
              continue
          chunks.append(chunk)

-     # C) Transcribe Each Chunk with Faster Whisper
-     all_texts = []
      for chunk in chunks:
          if stop_processing:
              return "⚠️ Transcription Stopped by User ⚠️"

-         # Faster Whisper can accept a numpy array directly (float32)
-         # Provide `sample_rate` and `language="he"` for Hebrew
-         segment_generator, info = model.transcribe(
-             chunk.astype(np.float32),
-             language="he",
-             sample_rate=sample_rate
-         )
-
-         # Gather text from each segment
-         chunk_text = []
-         for seg in segment_generator:
-             if stop_processing:
-                 return "⚠️ Transcription Stopped by User ⚠️"
-             chunk_text.append(seg.text)
-
-         # Combine chunk texts
-         all_texts.append(" ".join(chunk_text))
-
-     # Join all chunk transcriptions into one final string
-     full_text = " ".join(all_texts)
-     return full_text
-
- # ---------------------------
- # 4. Build Gradio Interface
- # ---------------------------
  with gr.Blocks() as demo:
-     gr.Markdown("## Hebrew Speech-to-Text (Faster Whisper)")

      audio_input = gr.Audio(type="filepath", label="Upload Hebrew Audio")
-     transcription_output = gr.Textbox(label="Transcription Output")

      start_btn = gr.Button("Start Transcription")
      stop_btn = gr.Button("Stop Processing", variant="stop")

-     # Link buttons to functions
-     start_btn.click(transcribe, inputs=audio_input, outputs=transcription_output)
      stop_btn.click(stop)

- # Launch the Gradio app
  demo.launch()
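Note on the removed faster-whisper version: WhisperModel.transcribe() takes no sample_rate keyword, so the call above would fail with a TypeError. The method accepts a file path, a file-like object, or a mono float32 NumPy array that is already sampled at 16 kHz. A minimal corrected sketch, assuming the same model and 16 kHz chunk as above (beam_size shown at faster-whisper's default):

    # Hedged sketch, not part of the commit: the corrected faster-whisper call.
    segments, info = model.transcribe(
        chunk.astype(np.float32),  # mono float32 waveform, already at 16 kHz
        language="he",             # Hebrew
        beam_size=5,               # faster-whisper's default beam width
    )
    text = " ".join(seg.text for seg in segments)  # segments is a lazy generator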
 
  import gradio as gr
  import torch
  import librosa
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration

  # --------------------------------
+ # 1) Load the Whisper Model & Processor
  # --------------------------------
+ model_id = "ivrit-ai/whisper-large-v3-turbo"
+ processor = WhisperProcessor.from_pretrained(model_id)
+ model = WhisperForConditionalGeneration.from_pretrained(model_id)
+
+ # If GPU is available, use it
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)

+ # ---------------------------------------------------
+ # 2) A Global Stop Flag to Halt Mid-Transcription
+ # ---------------------------------------------------
+ stop_processing = False
  def stop():
      global stop_processing
      stop_processing = True

+ # ---------------------------------------------------
+ # 3) The Main Transcription Function with Chunking
+ # ---------------------------------------------------
  def transcribe(audio_file):
      global stop_processing
+     stop_processing = False  # Reset each new transcription

+     # A) Load Audio with Librosa @ 16 kHz
+     waveform, sr = librosa.load(audio_file, sr=16000)

+     # Limit audio to 60 minutes
+     max_audio_sec = 60 * 60
+     if len(waveform) > sr * max_audio_sec:
+         waveform = waveform[: sr * max_audio_sec]

+     # B) Split audio into ~2-minute chunks
+     chunk_duration = 2 * 60  # 120 seconds
      chunks = []
      for start_idx in range(0, len(waveform), sr * chunk_duration):
          if stop_processing:
              return "⚠️ Transcription Stopped by User ⚠️"

          chunk = waveform[start_idx : start_idx + sr * chunk_duration]
+         # Skip super-short chunks (< 2 seconds), optional
          if len(chunk) < sr * 2:
              continue
          chunks.append(chunk)

+     # C) Transcribe Each Chunk
+     transcriptions = []
      for chunk in chunks:
          if stop_processing:
              return "⚠️ Transcription Stopped by User ⚠️"

+         # Prepare chunk for Whisper
+         inputs = processor(
+             chunk, sampling_rate=16000, return_tensors="pt", language="he"
+         ).input_features.to(device)
+
+         # Generate IDs
+         with torch.no_grad():
+             predicted_ids = model.generate(
+                 inputs,
+                 max_new_tokens=448,  # or 444 if you prefer
+                 do_sample=False,     # deterministic
+             )
+
+         # Decode
+         text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+         transcriptions.append(text)
+
+     # D) Combine Final Output
+     return " ".join(transcriptions)
+
+ # ------------------------------
+ # 4) Create a Gradio Interface
+ # ------------------------------
  with gr.Blocks() as demo:
+     gr.Markdown("## Hebrew Whisper: ivrit-ai/whisper-large-v3-turbo")

      audio_input = gr.Audio(type="filepath", label="Upload Hebrew Audio")
+     output_text = gr.Textbox(label="Transcription Output")

      start_btn = gr.Button("Start Transcription")
      stop_btn = gr.Button("Stop Processing", variant="stop")

+     # Button Actions
+     start_btn.click(transcribe, inputs=audio_input, outputs=output_text)
      stop_btn.click(stop)

+ # Launch the App
  demo.launch()
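Two hedged caveats on the new transformers version, offered as suggestions rather than fixes to the commit itself. First, WhisperProcessor forwards audio input to the feature extractor, which has no language parameter, so language="he" in the processor call is not what selects Hebrew; in transformers the language is chosen at generation time. Second, the feature extractor pads or truncates every input to Whisper's 30-second window, so each ~2-minute chunk would be cut to its first 30 seconds; using chunk_duration = 30 (or the library's long-form decoding) avoids the silent loss. A minimal sketch of the inner loop under those assumptions, for a recent transformers release where generate() accepts language and task:

    # Hedged sketch: feature extraction without the language kwarg ...
    inputs = processor(
        chunk, sampling_rate=16000, return_tensors="pt"
    ).input_features.to(device)

    # ... and language/task passed to generate() instead.
    with torch.no_grad():
        predicted_ids = model.generate(
            inputs,
            language="he",       # Hebrew, applied via Whisper's forced decoder tokens
            task="transcribe",   # as opposed to "translate"
            max_new_tokens=444,  # headroom under Whisper's 448-token decoder limit
            do_sample=False,     # deterministic decoding
        )
    text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

On older transformers releases, the equivalent is to pass forced_decoder_ids = processor.get_decoder_prompt_ids(language="he", task="transcribe") to generate().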