EladSpamson committed (verified)
Commit 221d07a · Parent: 3d55353

Update app.py

Files changed (1): app.py (+56, -64)
app.py CHANGED
@@ -1,104 +1,96 @@
  import gradio as gr
  import torch
  import librosa
- import numpy as np
- from faster_whisper import WhisperModel
-
- # -----------------------------
- # 1. Load Faster Whisper Model
- # -----------------------------
- # * Use device="cuda" if a GPU is available, otherwise "cpu".
- model = WhisperModel(
-     "ivrit-ai/whisper-large-v3-turbo-ct2",
-     device="cuda" if torch.cuda.is_available() else "cpu"
- )

  # --------------------------------
- # 2. Global Stop Flag for Gradio
  # --------------------------------
- stop_processing = False

  def stop():
-     """Set a global stop flag, letting the user interrupt transcription."""
      global stop_processing
      stop_processing = True

- # --------------------------------------------
- # 3. Transcription Function (with Chunking)
- # --------------------------------------------
  def transcribe(audio_file):
-     """
-     Transcribe Hebrew speech from an uploaded audio file using Faster Whisper.
-     Splits audio into ~2-minute chunks to handle very large files (up to 60 min).
-     """
      global stop_processing
-     stop_processing = False  # Reset at the start of a new transcription

-     # A) Load Audio (Librosa) -> 16kHz
-     sample_rate = 16000
-     waveform, sr = librosa.load(audio_file, sr=sample_rate)

-     # Trim audio if it exceeds 60 minutes
-     max_audio_length = 60 * 60  # 60 minutes in seconds
-     if len(waveform) > sr * max_audio_length:
-         waveform = waveform[: sr * max_audio_length]

-     # B) Split into ~2-min chunks
-     chunk_duration = 2 * 60  # 2 minutes = 120 seconds
      chunks = []
      for start_idx in range(0, len(waveform), sr * chunk_duration):
          if stop_processing:
              return "⚠️ Transcription Stopped by User ⚠️"

          chunk = waveform[start_idx : start_idx + sr * chunk_duration]
-         # Skip very short chunks (<2s) if you want
          if len(chunk) < sr * 2:
              continue
          chunks.append(chunk)

-     # C) Transcribe Each Chunk with Faster Whisper
-     all_texts = []
      for chunk in chunks:
          if stop_processing:
              return "⚠️ Transcription Stopped by User ⚠️"

-         # Faster Whisper can accept a numpy array directly (float32)
-         # Provide `sample_rate` and `language="he"` for Hebrew
-         segment_generator, info = model.transcribe(
-             chunk.astype(np.float32),
-             language="he",
-             sample_rate=sample_rate
-         )
-
-         # Gather text from each segment
-         chunk_text = []
-         for seg in segment_generator:
-             if stop_processing:
-                 return "⚠️ Transcription Stopped by User ⚠️"
-             chunk_text.append(seg.text)
-
-         # Combine chunk texts
-         all_texts.append(" ".join(chunk_text))
-
-     # Join all chunk transcriptions into one final string
-     full_text = " ".join(all_texts)
-     return full_text
-
- # ---------------------------
- # 4. Build Gradio Interface
- # ---------------------------
  with gr.Blocks() as demo:
-     gr.Markdown("## Hebrew Speech-to-Text (Faster Whisper)")

      audio_input = gr.Audio(type="filepath", label="Upload Hebrew Audio")
-     transcription_output = gr.Textbox(label="Transcription Output")

      start_btn = gr.Button("Start Transcription")
      stop_btn = gr.Button("Stop Processing", variant="stop")

-     # Link buttons to functions
-     start_btn.click(transcribe, inputs=audio_input, outputs=transcription_output)
      stop_btn.click(stop)

- # Launch the Gradio app
  demo.launch()
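Note on the removed faster-whisper version: WhisperModel.transcribe() takes no sample_rate keyword, so the call above would fail with a TypeError. The method accepts a file path, a file-like object, or a mono float32 NumPy array that is already sampled at 16 kHz. A minimal corrected sketch, assuming the same model and 16 kHz chunk as above (beam_size shown at faster-whisper's default):

    # Hedged sketch, not part of the commit: the corrected faster-whisper call.
    segments, info = model.transcribe(
        chunk.astype(np.float32),  # mono float32 waveform, already at 16 kHz
        language="he",             # Hebrew
        beam_size=5,               # faster-whisper's default beam width
    )
    text = " ".join(seg.text for seg in segments)  # segments is a lazy generator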
 
  import gradio as gr
  import torch
  import librosa
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration

  # --------------------------------
+ # 1) Load the Whisper Model & Processor
  # --------------------------------
+ model_id = "ivrit-ai/whisper-large-v3-turbo"
+ processor = WhisperProcessor.from_pretrained(model_id)
+ model = WhisperForConditionalGeneration.from_pretrained(model_id)
+
+ # If GPU is available, use it
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model.to(device)

+ # ---------------------------------------------------
+ # 2) A Global Stop Flag to Halt Mid-Transcription
+ # ---------------------------------------------------
+ stop_processing = False
  def stop():
      global stop_processing
      stop_processing = True

+ # ---------------------------------------------------
+ # 3) The Main Transcription Function with Chunking
+ # ---------------------------------------------------
  def transcribe(audio_file):
      global stop_processing
+     stop_processing = False  # Reset each new transcription

+     # A) Load Audio with Librosa @ 16 kHz
+     waveform, sr = librosa.load(audio_file, sr=16000)

+     # Limit audio to 60 minutes
+     max_audio_sec = 60 * 60
+     if len(waveform) > sr * max_audio_sec:
+         waveform = waveform[: sr * max_audio_sec]

+     # B) Split audio into ~2-minute chunks
+     chunk_duration = 2 * 60  # 120 seconds
      chunks = []
      for start_idx in range(0, len(waveform), sr * chunk_duration):
          if stop_processing:
              return "⚠️ Transcription Stopped by User ⚠️"

          chunk = waveform[start_idx : start_idx + sr * chunk_duration]
+         # Skip super-short chunks (< 2 seconds), optional
          if len(chunk) < sr * 2:
              continue
          chunks.append(chunk)

+     # C) Transcribe Each Chunk
+     transcriptions = []
      for chunk in chunks:
          if stop_processing:
              return "⚠️ Transcription Stopped by User ⚠️"

+         # Prepare chunk for Whisper
+         inputs = processor(
+             chunk, sampling_rate=16000, return_tensors="pt", language="he"
+         ).input_features.to(device)
+
+         # Generate IDs
+         with torch.no_grad():
+             predicted_ids = model.generate(
+                 inputs,
+                 max_new_tokens=448,  # or 444 if you prefer
+                 do_sample=False,     # deterministic
+             )
+
+         # Decode
+         text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+         transcriptions.append(text)
+
+     # D) Combine Final Output
+     return " ".join(transcriptions)
+
+ # ------------------------------
+ # 4) Create a Gradio Interface
+ # ------------------------------
  with gr.Blocks() as demo:
+     gr.Markdown("## Hebrew Whisper: ivrit-ai/whisper-large-v3-turbo")

      audio_input = gr.Audio(type="filepath", label="Upload Hebrew Audio")
+     output_text = gr.Textbox(label="Transcription Output")

      start_btn = gr.Button("Start Transcription")
      stop_btn = gr.Button("Stop Processing", variant="stop")

+     # Button Actions
+     start_btn.click(transcribe, inputs=audio_input, outputs=output_text)
      stop_btn.click(stop)

+ # Launch the App
  demo.launch()
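Two hedged caveats on the new transformers version, offered as suggestions rather than fixes to the commit itself. First, WhisperProcessor forwards audio input to the feature extractor, which has no language parameter, so language="he" in the processor call is not what selects Hebrew; in transformers the language is chosen at generation time. Second, the feature extractor pads or truncates every input to Whisper's 30-second window, so each ~2-minute chunk would be cut to its first 30 seconds; using chunk_duration = 30 (or the library's long-form decoding) avoids the silent loss. A minimal sketch of the inner loop under those assumptions, for a recent transformers release where generate() accepts language and task:

    # Hedged sketch: feature extraction without the language kwarg ...
    inputs = processor(
        chunk, sampling_rate=16000, return_tensors="pt"
    ).input_features.to(device)

    # ... and language/task passed to generate() instead.
    with torch.no_grad():
        predicted_ids = model.generate(
            inputs,
            language="he",       # Hebrew, applied via Whisper's forced decoder tokens
            task="transcribe",   # as opposed to "translate"
            max_new_tokens=444,  # headroom under Whisper's 448-token decoder limit
            do_sample=False,     # deterministic decoding
        )
    text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

On older transformers releases, the equivalent is to pass forced_decoder_ids = processor.get_decoder_prompt_ids(language="he", task="transcribe") to generate().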