Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -4,85 +4,109 @@ import librosa
 import numpy as np
 from transformers import WhisperProcessor, WhisperForConditionalGeneration

-#
-
-
-
-
-
+# ------------------------------
+# 1. Load the Model & Processor
+# ------------------------------
+model_id = "ivrit-ai/faster-whisper-v2-d4"  # Replace with a verified HF model if needed, e.g. "openai/whisper-large-v2"
+
+try:
+    processor = WhisperProcessor.from_pretrained(model_id)
+    model = WhisperForConditionalGeneration.from_pretrained(model_id)
+except OSError as e:
+    raise ValueError(
+        f"Unable to load model or tokenizer from '{model_id}'. "
+        "Double-check that the model ID is valid on Hugging Face Hub."
+    ) from e
+
+# Force GPU usage if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)

-#
-
+# ---------------------------
+# 2. Global Stop Flag
+# ---------------------------
+stop_processing = False

-# Function to stop transcription
 def stop():
+    """
+    Callback to set a global stop flag, allowing the user to interrupt
+    transcription mid-way through processing.
+    """
     global stop_processing
-    stop_processing = True
+    stop_processing = True
+

-#
+# -------------------------------------------
+# 3. Transcription Function (with Chunking)
+# -------------------------------------------
 def transcribe(audio):
+    """
+    Transcribes Hebrew speech from an uploaded audio file.
+    Splits long audio into 2-minute chunks to handle large files (up to 60 min).
+    """
     global stop_processing
-    stop_processing = False  # Reset
+    stop_processing = False  # Reset at start

-    #
+    # --- A) Load Audio & Limit to 60 Minutes
     waveform, sr = librosa.load(audio, sr=16000)
-
-    # Set chunk size (~2 min per chunk)
-    chunk_duration = 2 * 60  # 2 minutes (120 seconds)
-    max_audio_length = 60 * 60  # 60 minutes
-    chunks = []
-
-    # Ensure audio doesn't exceed 60 minutes
+    max_audio_length = 60 * 60  # 60 minutes in seconds
     if len(waveform) > sr * max_audio_length:
         waveform = waveform[: sr * max_audio_length]

-    # Split
+    # --- B) Split Audio into ~2-minute Chunks
+    chunk_duration = 2 * 60  # 2 minutes (120 seconds)
+    chunks = []
     for i in range(0, len(waveform), sr * chunk_duration):
-        if stop_processing:
+        if stop_processing:
             return "⚠️ Transcription Stopped by User ⚠️"

         chunk = waveform[i : i + sr * chunk_duration]
-
+        # Optional: skip very short chunks (<2 seconds)
+        if len(chunk) < sr * 2:
             continue
         chunks.append(chunk)

-    # Process
+    # --- C) Process Each Chunk with Whisper
     transcriptions = []
     for chunk in chunks:
-        if stop_processing:
+        if stop_processing:
             return "⚠️ Transcription Stopped by User ⚠️"

-
+        # Convert the chunk to Whisper input features
+        inputs = processor(chunk, sampling_rate=16000, return_tensors="pt", language="he").input_features.to(device)

         with torch.no_grad():
             predicted_ids = model.generate(
-
-                max_new_tokens=444,
-                do_sample=False
+                inputs,
+                max_new_tokens=444,  # Prevent exceeding model’s token limit
+                do_sample=False,  # Stable transcription (disable random sampling)
             )

-        # Decode
-
-        transcriptions.append(
+        # Decode tokens to text
+        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        transcriptions.append(text)
+

-    # Join all chunk transcriptions into one
-    full_transcription = " ".join(transcriptions)
-    return full_transcription
+    # --- D) Combine All Chunk Transcriptions
+    return " ".join(transcriptions)

-#
+# ------------------------
+# 4. Build Gradio Interface
+# ------------------------
 with gr.Blocks() as iface:
-    gr.Markdown("
+    gr.Markdown("## Hebrew Speech-to-Text (Faster Whisper)")

+    # Inputs/Outputs
     audio_input = gr.Audio(type="filepath", label="Upload Hebrew Audio")
     output_text = gr.Textbox(label="Transcription Output")

+    # Buttons
     start_btn = gr.Button("Start Transcription")
     stop_btn = gr.Button("Stop Processing", variant="stop")

+    # Click Actions
     start_btn.click(transcribe, inputs=audio_input, outputs=output_text)
-    stop_btn.click(stop)
+    stop_btn.click(stop)

-# Launch the Gradio
+# Launch the Gradio App
 iface.launch()
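
A note on the "Runtime error" badge above: ivrit-ai/faster-whisper-v2-d4 is published as a CTranslate2 checkpoint for the faster-whisper runtime, not in transformers format, so WhisperForConditionalGeneration.from_pretrained will most likely fail at startup, trip the except OSError branch, and crash the Space before the UI ever launches. Below is a minimal sketch of the same transcribe function using the faster-whisper library instead; the model ID comes from the diff, everything else (the package being in requirements.txt, the compute types) is an assumption:

    import torch
    from faster_whisper import WhisperModel

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Assumption: the faster-whisper package is installed; it loads
    # CTranslate2 checkpoints like this one directly from the Hub.
    model = WhisperModel(
        "ivrit-ai/faster-whisper-v2-d4",
        device=device,
        compute_type="float16" if device == "cuda" else "int8",
    )

    def transcribe(audio_path):
        # transcribe() windows long audio internally, so the manual
        # 2-minute chunking above is unnecessary; it returns a lazy
        # generator of segments plus a metadata object.
        segments, _info = model.transcribe(audio_path, language="he")
        return " ".join(segment.text.strip() for segment in segments)

Because segments are yielded lazily, the global stop flag could be checked between segments by replacing the join with an explicit loop.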
|
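Separately, even with a transformers-format checkpoint (e.g. openai/whisper-large-v2, which the diff's own comment offers as a fallback), the language="he" keyword on the processor call looks suspect: the feature-extraction path of WhisperProcessor does not use a language argument, so it is at best silently ignored (and may raise on some versions). Language and task are normally selected at generation time. A hedged sketch of the chunk-loop body with that fix, reusing the names from the diff (processor, model, device, chunk):

    inputs = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features.to(device)

    with torch.no_grad():
        predicted_ids = model.generate(
            inputs,
            language="he",       # accepted by Whisper's generate() in recent transformers releases
            task="transcribe",
            max_new_tokens=444,
            do_sample=False,
        )

    text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

On older transformers releases the equivalent is to pass forced_decoder_ids=processor.get_decoder_prompt_ids(language="hebrew", task="transcribe") to generate().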