EladSpamson committed
Commit f34f2ca · verified · 1 Parent(s): 2a0a17e

Update app.py

Files changed (1)
  1. app.py +7 -23
app.py CHANGED
@@ -11,10 +11,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 
 # Force Hebrew transcription
-forced_decoder_ids = processor.get_decoder_prompt_ids(
-    language="he",
-    task="transcribe"
-)
+forced_decoder_ids = processor.get_decoder_prompt_ids(language="he", task="transcribe")
 
 stop_processing = False
 def stop():
@@ -22,40 +19,30 @@ def stop():
     stop_processing = True
 
 def transcribe_audio(audio_file):
-    """
-    Process up to 3600 seconds of the audio (1 hour) in 25-second chunks.
-    Return partial text chunk by chunk (generator).
-    """
     global stop_processing
     stop_processing = False
 
-    # 1) Load at 16kHz
     waveform, sr = librosa.load(audio_file, sr=16000)
-
-    # 2) Truncate to the first 3600 seconds (1 hour)
-    time_limit_s = 3600
+    time_limit_s = 3600  # 1 hour
     if len(waveform) > sr * time_limit_s:
         waveform = waveform[: sr * time_limit_s]
 
-    # Additional safety limit if total is somehow over 60 minutes
+    # Safety limit (just in case)
     max_audio_sec = 60 * 60
     if len(waveform) > sr * max_audio_sec:
         waveform = waveform[: sr * max_audio_sec]
 
-    # 3) Split that audio into 25-second chunks
+    # Split into 25-second chunks
     chunk_duration_s = 25
     chunk_size = sr * chunk_duration_s
     chunks = []
     for start_idx in range(0, len(waveform), chunk_size):
         chunk = waveform[start_idx : start_idx + chunk_size]
-        # Skip very small final pieces (less than 1 second)
         if len(chunk) < sr * 1:
             continue
         chunks.append(chunk)
 
     partial_text = ""
-
-    # 4) Transcribe chunk by chunk
     for i, chunk in enumerate(chunks):
         if stop_processing:
             yield "⚠️ Stopped by User ⚠️"
@@ -75,7 +62,7 @@ def transcribe_audio(audio_file):
         predicted_ids = model.generate(
             input_features,
             attention_mask=attention_mask,
-            max_new_tokens=444,  # keep under total token limit
+            max_new_tokens=444,
             do_sample=False,
             forced_decoder_ids=forced_decoder_ids
         )
@@ -83,10 +70,8 @@ def transcribe_audio(audio_file):
         text_chunk = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
         partial_text += text_chunk + "\n"
 
-        # Stream updated partial text
         yield partial_text
 
-# Build Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown("## Hebrew Whisper (up to 1 hour, 25-second chunks)")
 
@@ -96,9 +81,8 @@ with gr.Blocks() as demo:
     start_btn = gr.Button("Start Transcription")
    stop_btn = gr.Button("Stop Processing", variant="stop")
 
-    # Stream chunk-by-chunk without a progress bar
     start_btn.click(transcribe_audio, inputs=audio_input, outputs=output_text)
     stop_btn.click(stop)
 
-# Enable API by setting enable_api=True
-demo.launch(enable_api=True)
+# Replace `enable_api=True` with `api_open=True`
+demo.launch(api_open=True)
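
Note: the hunks above pass `input_features` and `attention_mask` to `model.generate()` without showing the unchanged lines (old 62-74 / new 49-61) that build them from each chunk. The following is a minimal, hypothetical sketch of that per-chunk step, not part of this commit; it assumes the `processor`, `model`, `device`, and `forced_decoder_ids` objects defined earlier in app.py are the usual transformers WhisperProcessor / WhisperForConditionalGeneration pair.

import torch

# Hypothetical helper (not in the commit) illustrating how one 25-second chunk
# is typically converted into input_features / attention_mask and decoded.
def transcribe_chunk(chunk, sr, processor, model, device, forced_decoder_ids):
    inputs = processor(
        chunk,
        sampling_rate=sr,
        return_tensors="pt",
        return_attention_mask=True,  # provides the attention_mask passed to generate()
    )
    input_features = inputs.input_features.to(device)
    attention_mask = inputs.attention_mask.to(device)

    with torch.no_grad():
        predicted_ids = model.generate(
            input_features,
            attention_mask=attention_mask,
            max_new_tokens=444,
            do_sample=False,
            forced_decoder_ids=forced_decoder_ids,
        )
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

Inside the loop, the returned string would be appended to `partial_text` and yielded, which is what streams partial results into the Gradio textbox chunk by chunk.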