EladSpamson committed on
Commit
8be8710
·
verified ·
1 Parent(s): c0ea370

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -80
app.py CHANGED
@@ -10,93 +10,50 @@ model = WhisperForConditionalGeneration.from_pretrained(model_id)
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
  model.to(device)
12
 
13
- # Force Hebrew transcription
14
- forced_decoder_ids = processor.get_decoder_prompt_ids(
15
- language="he",
16
- task="transcribe"
17
- )
18
-
19
- stop_processing = False
20
- def stop():
21
- global stop_processing
22
- stop_processing = True
23
 
24
- def transcribe_30_seconds(audio_file):
25
- """
26
- Process only the first 30 seconds of the audio, in small 5-second chunks.
27
- Return partial text chunk by chunk (generator).
28
- """
29
- global stop_processing
30
- stop_processing = False
31
-
32
- # 1) Load at 16kHz
33
  waveform, sr = librosa.load(audio_file, sr=16000)
34
 
35
- # 2) Truncate to the first 30 seconds
36
- time_limit_s = 6000
37
  if len(waveform) > sr * time_limit_s:
38
  waveform = waveform[: sr * time_limit_s]
39
 
40
- # Also limit if total is over 60 min (safety)
41
- max_audio_sec = 60 * 60
42
- if len(waveform) > sr * max_audio_sec:
43
- waveform = waveform[: sr * max_audio_sec]
44
-
45
- # 3) Split that 30s portion into 5s chunks
46
- chunk_duration_s = 25
47
- chunk_size = sr * chunk_duration_s
48
- chunks = []
49
- for start_idx in range(0, len(waveform), chunk_size):
50
- chunk = waveform[start_idx : start_idx + chunk_size]
51
- if len(chunk) < sr * 1:
52
- continue
53
- chunks.append(chunk)
54
-
55
- partial_text = ""
56
-
57
- # 4) Transcribe chunk by chunk
58
- for i, chunk in enumerate(chunks):
59
- if stop_processing:
60
- yield "⚠️ Stopped by User ⚠️"
61
- return
62
-
63
- inputs = processor(
64
- chunk,
65
- sampling_rate=16000,
66
- return_tensors="pt",
67
- padding="longest",
68
- return_attention_mask=True
69
  )
70
- input_features = inputs.input_features.to(device)
71
- attention_mask = inputs.attention_mask.to(device)
72
 
73
- with torch.no_grad():
74
- predicted_ids = model.generate(
75
- input_features,
76
- attention_mask=attention_mask,
77
- max_new_tokens=444, # keep under total token limit
78
- do_sample=False,
79
- forced_decoder_ids=forced_decoder_ids
80
- )
81
-
82
- text_chunk = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
83
- partial_text += text_chunk + "\n"
84
-
85
- # Send updated partial text to the UI
86
- yield partial_text
87
-
88
- # Build Gradio UI
89
- with gr.Blocks() as demo:
90
- gr.Markdown("## Hebrew Whisper (Truncate to 30s, No Progress Bar)")
91
-
92
- audio_input = gr.Audio(type="filepath", label="Upload Audio (Truncate to 30s)")
93
- output_text = gr.Textbox(label="Partial Transcription")
94
-
95
- start_btn = gr.Button("Start Transcription")
96
- stop_btn = gr.Button("Stop Processing", variant="stop")
97
-
98
- # Stream chunk-by-chunk, no progress bar
99
- start_btn.click(transcribe_30_seconds, inputs=audio_input, outputs=output_text)
100
- stop_btn.click(stop)
101
 
 
102
  demo.launch()
 
10
# Select the compute device: prefer GPU when CUDA is present, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

# Decoder prompt ids that force Whisper to transcribe in Hebrew
# (rather than auto-detecting language or translating).
forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="he",
    task="transcribe",
)
15
def transcribe_audio(audio_file):
    """Process only the first 30 seconds of an audio file and return text."""
    # Load as mono float samples resampled to 16 kHz (Whisper's expected rate).
    waveform, sr = librosa.load(audio_file, sr=16000)

    # Truncate to the first 30 seconds of samples.
    limit = sr * 30
    if len(waveform) > limit:
        waveform = waveform[:limit]

    # Convert raw samples into log-mel input features plus an attention mask.
    encoded = processor(
        waveform,
        sampling_rate=16000,
        return_tensors="pt",
        padding="longest",
        return_attention_mask=True,
    )
    input_features = encoded.input_features.to(device)
    attention_mask = encoded.attention_mask.to(device)

    # Greedy decoding (do_sample=False); forced_decoder_ids pins the model
    # to Hebrew transcription. max_new_tokens=444 keeps the output under
    # Whisper's total decoder-token limit.
    with torch.no_grad():
        predicted_ids = model.generate(
            input_features,
            attention_mask=attention_mask,
            max_new_tokens=444,
            do_sample=False,
            forced_decoder_ids=forced_decoder_ids,
        )

    # Strip special tokens and return the single decoded transcript.
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
48
+
49
# Expose API endpoint for Make.com: gr.Interface serves both a web UI and an
# HTTP prediction endpoint; api_name="transcribe" names that endpoint
# (assumes a Gradio version whose Interface accepts api_name — verify).
demo = gr.Interface(
    transcribe_audio,
    gr.Audio(type="filepath"),
    "text",
    title="Hebrew Whisper API",
    api_name="transcribe",
)

# Run on Hugging Face Spaces (launched at container start).
demo.launch()