Update app.py
app.py CHANGED
@@ -1,104 +1,96 @@
 import gradio as gr
 import torch
 import librosa
-import …
-from faster_whisper import WhisperModel
-
-# -----------------------------
-# 1. Load Faster Whisper Model
-# -----------------------------
-# * Use device="cuda" if a GPU is available, otherwise "cpu".
-model = WhisperModel(
-    "ivrit-ai/whisper-large-v3-turbo-ct2",
-    device="cuda" if torch.cuda.is_available() else "cpu"
-)
+from transformers import WhisperProcessor, WhisperForConditionalGeneration

 # --------------------------------
-# …
+# 1) Load the Whisper Model & Processor
 # --------------------------------
-stop_processing = False
+model_id = "ivrit-ai/whisper-large-v3-turbo"
+processor = WhisperProcessor.from_pretrained(model_id)
+model = WhisperForConditionalGeneration.from_pretrained(model_id)
+
+# If GPU is available, use it
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model.to(device)

+# ---------------------------------------------------
+# 2) A Global Stop Flag to Halt Mid-Transcription
+# ---------------------------------------------------
+stop_processing = False
 def stop():
-    """Set a global stop flag, letting the user interrupt transcription."""
     global stop_processing
     stop_processing = True

-# -----------------------------
-# 3. …
-# -----------------------------
+# ---------------------------------------------------
+# 3) The Main Transcription Function with Chunking
+# ---------------------------------------------------
 def transcribe(audio_file):
-    """
-    Transcribe Hebrew speech from an uploaded audio file using Faster Whisper.
-    Splits audio into ~2-minute chunks to handle very large files (up to 60 min).
-    """
     global stop_processing
-    stop_processing = False  # Reset
+    stop_processing = False  # Reset each new transcription

-    # A) Load Audio
-
-    waveform, sr = librosa.load(audio_file, sr=sample_rate)
+    # A) Load Audio with Librosa @ 16 kHz
+    waveform, sr = librosa.load(audio_file, sr=16000)

-    # …
-
-    if len(waveform) > sr * …
-        waveform = waveform[: sr * …
+    # Limit audio to 60 minutes
+    max_audio_sec = 60 * 60
+    if len(waveform) > sr * max_audio_sec:
+        waveform = waveform[: sr * max_audio_sec]

-    # B) Split into ~2-minute chunks
-    chunk_duration = 2 * 60  # …
+    # B) Split audio into ~2-minute chunks
+    chunk_duration = 2 * 60  # 120 seconds
     chunks = []
     for start_idx in range(0, len(waveform), sr * chunk_duration):
         if stop_processing:
             return "⚠️ Transcription Stopped by User ⚠️"

         chunk = waveform[start_idx : start_idx + sr * chunk_duration]
-        # Skip …
+        # Skip super-short chunks (< 2 seconds), optional
         if len(chunk) < sr * 2:
             continue
         chunks.append(chunk)

     # C) Transcribe Each Chunk
-
+    transcriptions = []
     for chunk in chunks:
         if stop_processing:
             return "⚠️ Transcription Stopped by User ⚠️"

-        # …
-        …
-        )
-        …
+        # Prepare chunk for Whisper
+        inputs = processor(
+            chunk, sampling_rate=16000, return_tensors="pt", language="he"
+        ).input_features.to(device)
+
+        # Generate IDs
+        with torch.no_grad():
+            predicted_ids = model.generate(
+                inputs,
+                max_new_tokens=448,  # or 444 if you prefer
+                do_sample=False,     # deterministic
+            )
+
+        # Decode
+        text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        transcriptions.append(text)
+
+    # D) Combine Final Output
+    return " ".join(transcriptions)
+
-# ---------------------------
-# 4. Build Gradio Interface
-# ---------------------------
+# ------------------------------
+# 4) Create a Gradio Interface
+# ------------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("## Hebrew …
+    gr.Markdown("## Hebrew Whisper: ivrit-ai/whisper-large-v3-turbo")

     audio_input = gr.Audio(type="filepath", label="Upload Hebrew Audio")
-    …
+    output_text = gr.Textbox(label="Transcription Output")

     start_btn = gr.Button("Start Transcription")
     stop_btn = gr.Button("Stop Processing", variant="stop")

-    # …
-    start_btn.click(transcribe, inputs=audio_input, outputs=…
+    # Button Actions
+    start_btn.click(transcribe, inputs=audio_input, outputs=output_text)
     stop_btn.click(stop)

-# Launch the …
+# Launch the App
 demo.launch()
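
A note on the generation call added above: in the transformers API the language hint is consumed at generate time, not by the processor, whose call here only performs feature extraction; depending on the installed transformers version, the extra `language="he"` keyword may be silently ignored or rejected. Below is a minimal sketch of the generate-time variant, reusing processor, model, device, and chunk exactly as defined in app.py; the language/task keywords assume a reasonably recent transformers release, and max_new_tokens is kept under Whisper's 448-token decoder budget, which newer versions enforce once the forced prompt tokens are counted.

# Sketch: force Hebrew at decode time rather than in the processor call.
# Assumes processor, model, device, and chunk as defined in app.py above.
inputs = processor(
    chunk, sampling_rate=16000, return_tensors="pt"
).input_features.to(device)

with torch.no_grad():
    predicted_ids = model.generate(
        inputs,
        language="he",       # decoder language token
        task="transcribe",   # transcribe, not translate
        max_new_tokens=440,  # headroom under the 448-token decoder limit
        do_sample=False,     # deterministic decoding
    )

text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]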
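
The stop button works by flipping a module-level flag that transcribe polls only between chunks, so a click takes effect at the next chunk boundary rather than immediately. Recent Gradio releases can instead cancel the in-flight event through the `cancels` argument of an event listener; the sketch below shows that wiring under the assumption that transcribe is unchanged (the run_event handle name is illustrative).

# Sketch: cancel the running click event instead of using a global flag.
import gradio as gr

with gr.Blocks() as demo:
    audio_input = gr.Audio(type="filepath", label="Upload Hebrew Audio")
    output_text = gr.Textbox(label="Transcription Output")

    start_btn = gr.Button("Start Transcription")
    stop_btn = gr.Button("Stop Processing", variant="stop")

    # Keep a handle on the click event so the stop button can cancel it.
    run_event = start_btn.click(transcribe, inputs=audio_input, outputs=output_text)
    stop_btn.click(fn=None, inputs=None, outputs=None, cancels=[run_event])

demo.launch()

With cancels, Gradio aborts the request itself, so the per-chunk flag checks could be dropped; the flag approach in this commit has the advantage of returning a visible "stopped" message to the user instead of ending the run silently.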