EladSpamson committed
Commit 3d55353 · verified · 1 Parent(s): fd4a773

Update app.py

Files changed (1)
  1. app.py +59 -67
app.py CHANGED
@@ -2,111 +2,103 @@ import gradio as gr
  import torch
  import librosa
  import numpy as np
- from transformers import WhisperProcessor, WhisperForConditionalGeneration
-
- # ------------------------------
- # 1. Load the Model & Processor
- # ------------------------------
- model_id = "ivrit-ai/faster-whisper-v2-d4"  # Replace with a verified HF model if needed, e.g. "openai/whisper-large-v2"
-
- try:
-     processor = WhisperProcessor.from_pretrained(model_id)
-     model = WhisperForConditionalGeneration.from_pretrained(model_id)
- except OSError as e:
-     raise ValueError(
-         f"Unable to load model or tokenizer from '{model_id}'. "
-         "Double-check that the model ID is valid on Hugging Face Hub."
-     ) from e
-
- # Force GPU usage if available
- device = "cuda" if torch.cuda.is_available() else "cpu"
- model.to(device)
-
- # ---------------------------
- # 2. Global Stop Flag
- # ---------------------------
  stop_processing = False

  def stop():
-     """
-     Callback to set a global stop flag, allowing the user to interrupt
-     transcription mid-way through processing.
-     """
      global stop_processing
      stop_processing = True

-
- # -------------------------------------------
  # 3. Transcription Function (with Chunking)
- # -------------------------------------------
- def transcribe(audio):
      """
-     Transcribes Hebrew speech from an uploaded audio file.
-     Splits long audio into 2-minute chunks to handle large files (up to 60 min).
      """
      global stop_processing
-     stop_processing = False  # Reset at start
-
-     # --- A) Load Audio & Limit to 60 Minutes
-     waveform, sr = librosa.load(audio, sr=16000)
      max_audio_length = 60 * 60  # 60 minutes in seconds
      if len(waveform) > sr * max_audio_length:
          waveform = waveform[: sr * max_audio_length]

-     # --- B) Split Audio into ~2-minute Chunks
-     chunk_duration = 2 * 60  # 2 minutes (120 seconds)
      chunks = []
-     for i in range(0, len(waveform), sr * chunk_duration):
          if stop_processing:
              return "⚠️ Transcription Stopped by User ⚠️"

-         chunk = waveform[i : i + sr * chunk_duration]
-         # Optional: skip very short chunks (<2 seconds)
          if len(chunk) < sr * 2:
              continue
          chunks.append(chunk)

-     # --- C) Process Each Chunk with Whisper
-     transcriptions = []
      for chunk in chunks:
          if stop_processing:
              return "⚠️ Transcription Stopped by User ⚠️"

-         # Convert the chunk to Whisper input features
-         inputs = processor(chunk, sampling_rate=16000, return_tensors="pt", language="he").input_features.to(device)
-
-         with torch.no_grad():
-             predicted_ids = model.generate(
-                 inputs,
-                 max_new_tokens=444,  # Prevent exceeding model’s token limit
-                 do_sample=False,  # Stable transcription (disable random sampling)
-             )

-         # Decode tokens to text
-         text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-         transcriptions.append(text)

-     # --- D) Combine All Chunk Transcriptions
-     return " ".join(transcriptions)

-
- # ------------------------
  # 4. Build Gradio Interface
- # ------------------------
- with gr.Blocks() as iface:
      gr.Markdown("## Hebrew Speech-to-Text (Faster Whisper)")

-     # Inputs/Outputs
      audio_input = gr.Audio(type="filepath", label="Upload Hebrew Audio")
-     output_text = gr.Textbox(label="Transcription Output")

-     # Buttons
      start_btn = gr.Button("Start Transcription")
      stop_btn = gr.Button("Stop Processing", variant="stop")

-     # Click Actions
-     start_btn.click(transcribe, inputs=audio_input, outputs=output_text)
      stop_btn.click(stop)

- # Launch the Gradio App
- iface.launch()
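
Context for the removed block: "ivrit-ai/faster-whisper-v2-d4" is a faster-whisper checkpoint in CTranslate2 format, which transformers' WhisperProcessor / WhisperForConditionalGeneration cannot load, and language="he" is not a processor argument in any case (language and task are generation-time settings). A minimal sketch of the transformers pattern the removed code was reaching for, assuming a recent transformers version and a hypothetical stand-in model that does ship transformers weights ("openai/whisper-tiny"), with "sample.wav" as a placeholder path:

    # Sketch only, not this repo's code; "sample.wav" is a placeholder.
    import torch
    import librosa
    from transformers import WhisperProcessor, WhisperForConditionalGeneration

    processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

    waveform, _ = librosa.load("sample.wav", sr=16000)
    # The processor only extracts features; no language argument here.
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features

    with torch.no_grad():
        # Language/task are passed at generation time instead.
        ids = model.generate(inputs, language="he", task="transcribe")

    print(processor.batch_decode(ids, skip_special_tokens=True)[0])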
 
  import torch
  import librosa
  import numpy as np
+ from faster_whisper import WhisperModel
+
+ # -----------------------------
+ # 1. Load Faster Whisper Model
+ # -----------------------------
+ # * Use device="cuda" if a GPU is available, otherwise "cpu".
+ model = WhisperModel(
+     "ivrit-ai/whisper-large-v3-turbo-ct2",
+     device="cuda" if torch.cuda.is_available() else "cpu"
+ )
+
+ # --------------------------------
+ # 2. Global Stop Flag for Gradio
+ # --------------------------------
  stop_processing = False

  def stop():
+     """Set a global stop flag, letting the user interrupt transcription."""
      global stop_processing
      stop_processing = True

+ # --------------------------------------------
  # 3. Transcription Function (with Chunking)
+ # --------------------------------------------
+ def transcribe(audio_file):
      """
+     Transcribe Hebrew speech from an uploaded audio file using Faster Whisper.
+     Splits audio into ~2-minute chunks to handle very large files (up to 60 min).
      """
      global stop_processing
+     stop_processing = False  # Reset at the start of a new transcription
+
+     # A) Load Audio (Librosa) -> 16kHz
+     sample_rate = 16000
+     waveform, sr = librosa.load(audio_file, sr=sample_rate)

+     # Trim audio if it exceeds 60 minutes
      max_audio_length = 60 * 60  # 60 minutes in seconds
      if len(waveform) > sr * max_audio_length:
          waveform = waveform[: sr * max_audio_length]

+     # B) Split into ~2-min chunks
+     chunk_duration = 2 * 60  # 2 minutes = 120 seconds
      chunks = []
+     for start_idx in range(0, len(waveform), sr * chunk_duration):
          if stop_processing:
              return "⚠️ Transcription Stopped by User ⚠️"

+         chunk = waveform[start_idx : start_idx + sr * chunk_duration]
+         # Skip very short chunks (<2s) if you want
          if len(chunk) < sr * 2:
              continue
          chunks.append(chunk)

+     # C) Transcribe Each Chunk with Faster Whisper
+     all_texts = []
      for chunk in chunks:
          if stop_processing:
              return "⚠️ Transcription Stopped by User ⚠️"

+         # Faster Whisper accepts a 16 kHz float32 NumPy array directly;
+         # pass language="he" for Hebrew. transcribe() has no sample_rate
+         # keyword, so the audio must already be resampled to 16 kHz above.
+         segment_generator, info = model.transcribe(
+             chunk.astype(np.float32),
+             language="he"
+         )

+         # Gather text from each segment
+         chunk_text = []
+         for seg in segment_generator:
+             if stop_processing:
+                 return "⚠️ Transcription Stopped by User ⚠️"
+             chunk_text.append(seg.text)

+         # Combine chunk texts
+         all_texts.append(" ".join(chunk_text))

+     # Join all chunk transcriptions into one final string
+     full_text = " ".join(all_texts)
+     return full_text

+ # ---------------------------
  # 4. Build Gradio Interface
+ # ---------------------------
+ with gr.Blocks() as demo:
      gr.Markdown("## Hebrew Speech-to-Text (Faster Whisper)")

      audio_input = gr.Audio(type="filepath", label="Upload Hebrew Audio")
+     transcription_output = gr.Textbox(label="Transcription Output")

      start_btn = gr.Button("Start Transcription")
      stop_btn = gr.Button("Stop Processing", variant="stop")

+     # Link buttons to functions
+     start_btn.click(transcribe, inputs=audio_input, outputs=transcription_output)
      stop_btn.click(stop)

+ # Launch the Gradio app
+ demo.launch()
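
For reference, a minimal standalone sketch of the faster-whisper call pattern the new code relies on; "sample.wav" is a placeholder path, and device="cpu" only keeps the sketch GPU-free. WhisperModel.transcribe accepts a file path or a 16 kHz float32 NumPy array but no sample_rate keyword, which is why the resampling is done in librosa:

    # Minimal sketch, assuming faster-whisper and librosa are installed;
    # "sample.wav" is a placeholder path.
    import librosa
    import numpy as np
    from faster_whisper import WhisperModel

    # CTranslate2-format Whisper model, downloaded from the HF Hub.
    model = WhisperModel("ivrit-ai/whisper-large-v3-turbo-ct2", device="cpu")

    # Resample here, because transcribe() expects 16 kHz audio.
    waveform, _ = librosa.load("sample.wav", sr=16000)

    # Returns a lazy generator of segments plus a TranscriptionInfo object.
    segments, info = model.transcribe(waveform.astype(np.float32), language="he")
    print(info.language, info.duration)
    print(" ".join(seg.text.strip() for seg in segments))

Because transcribe() returns segments lazily, iterating them in a loop (as the app does) lets the global stop flag take effect partway through a chunk rather than only between chunks.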