Update app.py
Browse files
app.py
CHANGED
@@ -88,27 +88,15 @@ def build_html_output(s: str, style: str = "result_item_success"):
|
|
88 |
</div>
|
89 |
"""
|
90 |
|
91 |
-
def create_preview_player(
|
92 |
"""Create an HTML preview with audio player and subtitles"""
|
93 |
-
|
94 |
-
static_dir.mkdir(exist_ok=True)
|
95 |
-
|
96 |
-
# Copy files to static directory with friendly names
|
97 |
-
audio_filename = Path(audio_path).name
|
98 |
-
vtt_filename = Path(vtt_path).name
|
99 |
-
new_audio_path = static_dir / audio_filename
|
100 |
-
new_vtt_path = static_dir / vtt_filename
|
101 |
-
|
102 |
-
shutil.copy2(audio_path, new_audio_path)
|
103 |
-
shutil.copy2(vtt_path, new_vtt_path)
|
104 |
-
|
105 |
-
# Create direct HTML content
|
106 |
html_content = f"""
|
107 |
<div class="player-container">
|
108 |
<h3>Audio Player with Subtitles</h3>
|
109 |
<audio controls>
|
110 |
-
<source src="
|
111 |
-
<track label="Tibetan" kind="subtitles" srclang="bo" src="
|
112 |
Your browser does not support the audio element.
|
113 |
</audio>
|
114 |
</div>
|
@@ -123,72 +111,89 @@ def process_audio(audio_path: str):
|
|
123 |
"Please upload an audio file first",
|
124 |
"result_item_error",
|
125 |
),
|
126 |
-
|
127 |
-
|
128 |
"",
|
129 |
"",
|
130 |
)
|
131 |
|
132 |
logging.info(f"Processing audio file: {audio_path}")
|
133 |
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
140 |
|
141 |
-
# Get speech timestamps using Silero VAD
|
142 |
-
speech_timestamps = get_speech_ts(wav_np, vad_model, sampling_rate=SAMPLE_RATE)
|
143 |
-
if not speech_timestamps:
|
144 |
return (
|
145 |
-
build_html_output(
|
146 |
-
|
147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
"",
|
149 |
"",
|
150 |
)
|
151 |
|
152 |
-
timestamps_with_text = []
|
153 |
-
transcriptions = []
|
154 |
-
|
155 |
-
for ts in speech_timestamps:
|
156 |
-
start, end = ts['start'], ts['end']
|
157 |
-
segment = wav[start:end]
|
158 |
-
if segment.dim() > 1:
|
159 |
-
segment = segment.squeeze()
|
160 |
-
|
161 |
-
inputs = processor(segment, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
|
162 |
-
with torch.no_grad():
|
163 |
-
logits = model(**inputs).logits
|
164 |
-
predicted_ids = torch.argmax(logits, dim=-1)
|
165 |
-
transcription = processor.decode(predicted_ids[0])
|
166 |
-
transcriptions.append(transcription)
|
167 |
-
timestamps_with_text.append((start, end, transcription))
|
168 |
-
|
169 |
-
# Generate subtitle files
|
170 |
-
base_path = os.path.splitext(audio_path)[0]
|
171 |
-
srt_path = f"{base_path}.srt"
|
172 |
-
vtt_path = f"{base_path}.vtt"
|
173 |
-
|
174 |
-
create_subtitle_file(timestamps_with_text, srt_path, "srt")
|
175 |
-
create_subtitle_file(timestamps_with_text, vtt_path, "vtt")
|
176 |
-
|
177 |
-
# Create preview player
|
178 |
-
preview_html = create_preview_player(audio_path, vtt_path)
|
179 |
-
all_text = " ".join(transcriptions)
|
180 |
-
|
181 |
-
return (
|
182 |
-
build_html_output(
|
183 |
-
"Transcription completed! You can now:\n1. Download the SRT/VTT files\n2. Play the audio with subtitles below",
|
184 |
-
"result_item_success"
|
185 |
-
),
|
186 |
-
srt_path,
|
187 |
-
vtt_path,
|
188 |
-
preview_html,
|
189 |
-
all_text,
|
190 |
-
)
|
191 |
-
|
192 |
demo = gr.Blocks(css=css)
|
193 |
|
194 |
with demo:
|
@@ -231,4 +236,4 @@ with demo:
|
|
231 |
if __name__ == "__main__":
|
232 |
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
233 |
logging.basicConfig(format=formatter, level=logging.INFO)
|
234 |
-
demo.launch(share=True
|
|
|
88 |
</div>
|
89 |
"""
|
90 |
|
91 |
+
def create_preview_player(audio_file, vtt_file):
|
92 |
"""Create an HTML preview with audio player and subtitles"""
|
93 |
+
# Create direct HTML content using the file components directly
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
html_content = f"""
|
95 |
<div class="player-container">
|
96 |
<h3>Audio Player with Subtitles</h3>
|
97 |
<audio controls>
|
98 |
+
<source src="{audio_file.name}" type="audio/wav">
|
99 |
+
<track label="Tibetan" kind="subtitles" srclang="bo" src="{vtt_file.name}" default>
|
100 |
Your browser does not support the audio element.
|
101 |
</audio>
|
102 |
</div>
|
|
|
111 |
"Please upload an audio file first",
|
112 |
"result_item_error",
|
113 |
),
|
114 |
+
None,
|
115 |
+
None,
|
116 |
"",
|
117 |
"",
|
118 |
)
|
119 |
|
120 |
logging.info(f"Processing audio file: {audio_path}")
|
121 |
|
122 |
+
try:
|
123 |
+
# Load and resample audio to 16kHz mono
|
124 |
+
wav, sr = torchaudio.load(audio_path)
|
125 |
+
if sr != SAMPLE_RATE:
|
126 |
+
wav = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(wav)
|
127 |
+
wav = wav.mean(dim=0) # convert to mono
|
128 |
+
wav_np = wav.numpy()
|
129 |
+
|
130 |
+
# Get speech timestamps using Silero VAD
|
131 |
+
speech_timestamps = get_speech_ts(wav_np, vad_model, sampling_rate=SAMPLE_RATE)
|
132 |
+
if not speech_timestamps:
|
133 |
+
return (
|
134 |
+
build_html_output("No speech detected", "result_item_error"),
|
135 |
+
None,
|
136 |
+
None,
|
137 |
+
"",
|
138 |
+
"",
|
139 |
+
)
|
140 |
+
|
141 |
+
timestamps_with_text = []
|
142 |
+
transcriptions = []
|
143 |
+
|
144 |
+
for ts in speech_timestamps:
|
145 |
+
start, end = ts['start'], ts['end']
|
146 |
+
segment = wav[start:end]
|
147 |
+
if segment.dim() > 1:
|
148 |
+
segment = segment.squeeze()
|
149 |
+
|
150 |
+
inputs = processor(segment, sampling_rate=SAMPLE_RATE, return_tensors="pt", padding=True)
|
151 |
+
with torch.no_grad():
|
152 |
+
logits = model(**inputs).logits
|
153 |
+
predicted_ids = torch.argmax(logits, dim=-1)
|
154 |
+
transcription = processor.decode(predicted_ids[0])
|
155 |
+
transcriptions.append(transcription)
|
156 |
+
timestamps_with_text.append((start, end, transcription))
|
157 |
+
|
158 |
+
# Generate subtitle files
|
159 |
+
base_path = os.path.splitext(audio_path)[0]
|
160 |
+
srt_path = f"{base_path}.srt"
|
161 |
+
vtt_path = f"{base_path}.vtt"
|
162 |
+
|
163 |
+
create_subtitle_file(timestamps_with_text, srt_path, "srt")
|
164 |
+
create_subtitle_file(timestamps_with_text, vtt_path, "vtt")
|
165 |
+
|
166 |
+
# Create file components for Gradio
|
167 |
+
srt_file = gr.File.update(value=srt_path)
|
168 |
+
vtt_file = gr.File.update(value=vtt_path)
|
169 |
+
|
170 |
+
# Create preview player
|
171 |
+
preview_html = create_preview_player(srt_file, vtt_file)
|
172 |
+
all_text = " ".join(transcriptions)
|
173 |
|
|
|
|
|
|
|
174 |
return (
|
175 |
+
build_html_output(
|
176 |
+
"Transcription completed! You can now:\n1. Download the SRT/VTT files\n2. Play the audio with subtitles below",
|
177 |
+
"result_item_success"
|
178 |
+
),
|
179 |
+
srt_file,
|
180 |
+
vtt_file,
|
181 |
+
preview_html,
|
182 |
+
all_text,
|
183 |
+
)
|
184 |
+
except Exception as e:
|
185 |
+
logging.error(f"Error processing audio: {str(e)}")
|
186 |
+
return (
|
187 |
+
build_html_output(
|
188 |
+
f"Error processing audio: {str(e)}",
|
189 |
+
"result_item_error"
|
190 |
+
),
|
191 |
+
None,
|
192 |
+
None,
|
193 |
"",
|
194 |
"",
|
195 |
)
|
196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
197 |
demo = gr.Blocks(css=css)
|
198 |
|
199 |
with demo:
|
|
|
236 |
if __name__ == "__main__":
|
237 |
formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
|
238 |
logging.basicConfig(format=formatter, level=logging.INFO)
|
239 |
+
demo.launch(share=True)
|