EladSpamson committed on
Commit
8be8710
·
verified ·
1 Parent(s): c0ea370

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -80
app.py CHANGED
@@ -10,93 +10,50 @@ model = WhisperForConditionalGeneration.from_pretrained(model_id)
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
  model.to(device)
12
 
13
- # Force Hebrew transcription
14
- forced_decoder_ids = processor.get_decoder_prompt_ids(
15
- language="he",
16
- task="transcribe"
17
- )
18
-
19
- stop_processing = False
20
- def stop():
21
- global stop_processing
22
- stop_processing = True
23
 
24
- def transcribe_30_seconds(audio_file):
25
- """
26
- Process only the first 30 seconds of the audio, in small 5-second chunks.
27
- Return partial text chunk by chunk (generator).
28
- """
29
- global stop_processing
30
- stop_processing = False
31
-
32
- # 1) Load at 16kHz
33
  waveform, sr = librosa.load(audio_file, sr=16000)
34
 
35
- # 2) Truncate to the first 30 seconds
36
- time_limit_s = 6000
37
  if len(waveform) > sr * time_limit_s:
38
  waveform = waveform[: sr * time_limit_s]
39
 
40
- # Also limit if total is over 60 min (safety)
41
- max_audio_sec = 60 * 60
42
- if len(waveform) > sr * max_audio_sec:
43
- waveform = waveform[: sr * max_audio_sec]
44
-
45
- # 3) Split that 30s portion into 5s chunks
46
- chunk_duration_s = 25
47
- chunk_size = sr * chunk_duration_s
48
- chunks = []
49
- for start_idx in range(0, len(waveform), chunk_size):
50
- chunk = waveform[start_idx : start_idx + chunk_size]
51
- if len(chunk) < sr * 1:
52
- continue
53
- chunks.append(chunk)
54
-
55
- partial_text = ""
56
-
57
- # 4) Transcribe chunk by chunk
58
- for i, chunk in enumerate(chunks):
59
- if stop_processing:
60
- yield "⚠️ Stopped by User ⚠️"
61
- return
62
-
63
- inputs = processor(
64
- chunk,
65
- sampling_rate=16000,
66
- return_tensors="pt",
67
- padding="longest",
68
- return_attention_mask=True
69
  )
70
- input_features = inputs.input_features.to(device)
71
- attention_mask = inputs.attention_mask.to(device)
72
 
73
- with torch.no_grad():
74
- predicted_ids = model.generate(
75
- input_features,
76
- attention_mask=attention_mask,
77
- max_new_tokens=444, # keep under total token limit
78
- do_sample=False,
79
- forced_decoder_ids=forced_decoder_ids
80
- )
81
-
82
- text_chunk = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
83
- partial_text += text_chunk + "\n"
84
-
85
- # Send updated partial text to the UI
86
- yield partial_text
87
-
88
- # Build Gradio UI
89
- with gr.Blocks() as demo:
90
- gr.Markdown("## Hebrew Whisper (Truncate to 30s, No Progress Bar)")
91
-
92
- audio_input = gr.Audio(type="filepath", label="Upload Audio (Truncate to 30s)")
93
- output_text = gr.Textbox(label="Partial Transcription")
94
-
95
- start_btn = gr.Button("Start Transcription")
96
- stop_btn = gr.Button("Stop Processing", variant="stop")
97
-
98
- # Stream chunk-by-chunk, no progress bar
99
- start_btn.click(transcribe_30_seconds, inputs=audio_input, outputs=output_text)
100
- stop_btn.click(stop)
101
 
 
102
  demo.launch()
 
10
# Select the compute device: prefer GPU when CUDA is present, else CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

# Decoder prompt ids that force Whisper to transcribe in Hebrew
# (rather than auto-detecting language or translating).
forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="he",
    task="transcribe",
)
15
def transcribe_audio(audio_file):
    """Process only the first 30 seconds of an audio file and return text."""
    # Load as mono float samples resampled to 16 kHz (Whisper's expected rate).
    waveform, sr = librosa.load(audio_file, sr=16000)

    # Truncate to the first 30 seconds of samples.
    limit = sr * 30
    if len(waveform) > limit:
        waveform = waveform[:limit]

    # Convert raw samples into log-mel input features plus an attention mask.
    encoded = processor(
        waveform,
        sampling_rate=16000,
        return_tensors="pt",
        padding="longest",
        return_attention_mask=True,
    )
    input_features = encoded.input_features.to(device)
    attention_mask = encoded.attention_mask.to(device)

    # Greedy decoding (do_sample=False); forced_decoder_ids pins the model
    # to Hebrew transcription. max_new_tokens=444 keeps the output under
    # Whisper's total decoder-token limit.
    with torch.no_grad():
        predicted_ids = model.generate(
            input_features,
            attention_mask=attention_mask,
            max_new_tokens=444,
            do_sample=False,
            forced_decoder_ids=forced_decoder_ids,
        )

    # Strip special tokens and return the single decoded transcript.
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
48
+
49
# Expose API endpoint for Make.com: gr.Interface serves both a web UI and an
# HTTP prediction endpoint; api_name="transcribe" names that endpoint
# (assumes a Gradio version whose Interface accepts api_name — verify).
demo = gr.Interface(
    transcribe_audio,
    gr.Audio(type="filepath"),
    "text",
    title="Hebrew Whisper API",
    api_name="transcribe",
)

# Run on Hugging Face Spaces (launched at container start).
demo.launch()