Spaces:

doyouknowmarc
/

Transcription

Runtime error

App Files Files Community

doyouknowmarc commited on Feb 6, 2025

Commit

c2cfda7

verified ·

1 Parent(s): 66fb3d7

Init - create App.py

Browse files

Files changed (1) hide show

app.py +405 -0

app.py ADDED Viewed

	@@ -0,0 +1,405 @@

+import gradio as gr
+import warnings
+import torch
+import os
+import whisper
+import ssl
+import zipfile
+from pydub import AudioSegment
+from pydub.silence import detect_nonsilent
+import subprocess
+import tempfile
+import time
+ssl._create_default_https_context = ssl._create_unverified_context
+def process_audio(
+    audio_paths,
+    remove_silence=False,
+    min_silence_len=500,
+    silence_thresh=-50,
+    enable_chunking=False,
+    chunk_duration=600,
+    ffmpeg_path="ffmpeg",
+    model_size="large-v3-turbo",
+    language="de"
+):
+    try:
+        if not audio_paths:
+            return "No files selected.", "", None
+        # Clean up any existing temp directory at the start
+        temp_dir = "temp_processing"
+        if os.path.exists(temp_dir):
+            for file in os.listdir(temp_dir):
+                file_path = os.path.join(temp_dir, file)
+                try:
+                    if os.path.isfile(file_path):
+                        os.remove(file_path)
+                except Exception as e:
+                    print(f"Error cleaning up {file_path}: {e}")
+            try:
+                os.rmdir(temp_dir)
+            except Exception as e:
+                print(f"Error removing temp directory: {e}")
+        # Create fresh temp directory with unique timestamp
+        temp_dir = f"temp_processing_{int(time.time())}"
+        os.makedirs(temp_dir, exist_ok=True)
+        processed_files = []
+        all_results = []
+        all_segments = []
+        all_txt_paths = []
+        try:
+            # Step 1: Process each audio file
+            for audio_path in audio_paths:
+                if not audio_path:
+                    continue
+                current_file = audio_path
+                temp_files = []
+                # Step 1a: Split audio if chunking is enabled
+                if enable_chunking:
+                    base_name = os.path.splitext(os.path.basename(current_file))[0]
+                    output_pattern = os.path.join(temp_dir, f"{base_name}_part_%d.mp3")
+                    cmd = [
+                        ffmpeg_path, "-i", current_file,
+                        "-f", "segment",
+                        "-segment_time", str(chunk_duration),
+                        "-c:a", "copy",
+                        "-segment_start_number", "1",
+                        output_pattern
+                    ]
+                    subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                    chunk_files = sorted([os.path.join(temp_dir, f) for f in os.listdir(temp_dir)
+                                       if f.startswith(f"{base_name}_part_")])
+                    temp_files.extend(chunk_files)
+                else:
+                    temp_files.append(current_file)
+                # Step 1b: Remove silence if requested
+                if remove_silence:
+                    silence_removed_files = []
+                    for file in temp_files:
+                        audio = AudioSegment.from_file(file)
+                        nonsilent = detect_nonsilent(
+                            audio,
+                            min_silence_len=min_silence_len,
+                            silence_thresh=silence_thresh
+                        )
+                        output = AudioSegment.empty()
+                        for start, end in nonsilent:
+                            output += audio[start:end]
+                        # Save the silence-removed file
+                        silence_removed_path = os.path.join(temp_dir, f"silence_removed_{os.path.basename(file)}")
+                        output.export(silence_removed_path, format="mp3")
+                        silence_removed_files.append(silence_removed_path)
+                    processed_files.extend(silence_removed_files)
+                else:
+                    processed_files.extend(temp_files)
+            # Step 2: Transcribe all processed files
+            print(f"Loading Whisper model '{model_size}'...")
+            model = whisper.load_model(model_size, device="cpu")
+            for file in processed_files:
+                print(f"Transcribing: {file}")
+                warnings.filterwarnings("ignore", message="FP16 is not supported on CPU; using FP32 instead")
+                result = model.transcribe(file, fp16=False, language=language, temperature=0.0)
+                full_text = result["text"]
+                segments = ""
+                for segment in result["segments"]:
+                    segments += f"[{segment['start']:.2f} - {segment['end']:.2f}]: {segment['text']}\n"
+                # Store transcript files in temp directory
+                txt_path = os.path.join(temp_dir, f"transcript_{os.path.splitext(os.path.basename(file))[0]}.txt")
+                with open(txt_path, "w", encoding="utf-8") as f:
+                    f.write("=== Full Transcription ===\n\n")
+                    f.write(full_text)
+                    f.write("\n\n=== Segment-wise Transcription ===\n")
+                    f.write(segments)
+                all_results.append(full_text)
+                all_segments.append(segments)
+                all_txt_paths.append(txt_path)
+            # Create combined transcript file in temp directory
+            combined_txt_path = os.path.join(temp_dir, "combined_transcripts.txt")
+            with open(combined_txt_path, "w", encoding="utf-8") as f:
+                f.write("=== Combined Transcriptions ===\n\n")
+                for i, (result, segment, path) in enumerate(zip(all_results, all_segments, all_txt_paths)):
+                    filename = os.path.basename(processed_files[i])
+                    f.write(f"File: {filename}\n")
+                    f.write("=== Full Transcription ===\n")
+                    f.write(result)
+                    f.write("\n\n=== Segment-wise Transcription ===\n")
+                    f.write(segment)
+                    f.write("\n" + "-"*50 + "\n\n")
+            # Format display output
+            combined_results = "=== File Transcriptions ===\n\n"
+            combined_segments = "=== File Segments ===\n\n"
+            for i, (result, segment) in enumerate(zip(all_results, all_segments)):
+                filename = os.path.basename(processed_files[i])
+                combined_results += f"File: {filename}\n{result}\n\n"
+                combined_segments += f"File: {filename}\n{segment}\n\n"
+            # Create ZIP with all processed files and transcripts
+            zip_path = f"processed_files_and_transcripts_{int(time.time())}.zip"
+            cleanup_files = processed_files.copy()
+            with zipfile.ZipFile(zip_path, 'w') as zipf:
+                for file in processed_files:
+                    if os.path.exists(file):
+                        zipf.write(file, os.path.basename(file))
+                for txt_file in all_txt_paths:
+                    if os.path.exists(txt_file):
+                        zipf.write(txt_file)
+                if os.path.exists(combined_txt_path):
+                    zipf.write(combined_txt_path)
+            # Cleanup files after ZIP creation
+            for file in cleanup_files:
+                if os.path.exists(file):
+                    os.remove(file)
+            for txt_file in all_txt_paths:
+                if os.path.exists(txt_file):
+                    os.remove(txt_file)
+            if os.path.exists(combined_txt_path):
+                os.remove(combined_txt_path)
+            # Clean up temp directory
+            if os.path.exists(temp_dir):
+                for file in os.listdir(temp_dir):
+                    file_path = os.path.join(temp_dir, file)
+                    if os.path.isfile(file_path):
+                        os.remove(file_path)
+                os.rmdir(temp_dir)
+            return combined_results, combined_segments, zip_path
+        except Exception as inner_e:
+            print(f"Error during processing: {inner_e}")
+            raise inner_e
+    except Exception as e:
+        print(f"Error in process_audio: {e}")
+        if 'temp_dir' in locals() and os.path.exists(temp_dir):
+            try:
+                for file in os.listdir(temp_dir):
+                    file_path = os.path.join(temp_dir, file)
+                    if os.path.isfile(file_path):
+                        os.remove(file_path)
+                os.rmdir(temp_dir)
+            except:
+                pass
+        return f"Error: {str(e)}", "", None
+def create_interface():
+    with gr.Blocks(title="Interview Audio Processing App") as app:
+        gr.Markdown("""
+        # Audio Processing App
+        Upload audio files (MP3 or M4A) for processing and transcription.\\
+        Intended use case: transcription of interviews.
+        """)
+        with gr.Row():
+            with gr.Column():
+                audio_input = gr.File(
+                    label="Upload Audio Files",
+                    file_count="multiple",
+                    type="filepath"
+                )
+                with gr.Group():
+                    gr.Markdown("###  Silence Removal Settings")
+                    gr.Markdown(" Default settings are working very well. Silence removal helps to reduce hallucination.")
+                    remove_silence = gr.Checkbox(
+                        label="Remove Silence",
+                        value=False
+                    )
+                    min_silence_len = gr.Slider(
+                        minimum=100,
+                        maximum=2000,
+                        value=500,
+                        step=100,
+                        label="Minimum Silence Length (ms)",
+                        visible=False
+                    )
+                    silence_thresh = gr.Slider(
+                        minimum=-70,
+                        maximum=-30,
+                        value=-50,
+                        step=5,
+                        label="Silence Threshold (dB)",
+                        visible=False
+                    )
+                with gr.Group():
+                    gr.Markdown("###  Chunking Settings")
+                    gr.Markdown(" Chunking reduces the load on the model. 10min chunks work really good.")
+                    enable_chunking = gr.Checkbox(
+                        label="Enable Chunking",
+                        value=False
+                    )
+                    chunk_duration = gr.Slider(
+                        minimum=60,
+                        maximum=3600,
+                        value=600,
+                        step=60,
+                        label="Chunk Duration (seconds)",
+                        visible=False
+                    )
+                    ffmpeg_path = gr.Textbox(
+                        label="FFmpeg Path",
+                        value="ffmpeg",
+                        placeholder="Path to ffmpeg executable",
+                        visible=False
+                    )
+                with gr.Group():
+                    gr.Markdown("###  Transcription Settings")
+                    gr.Markdown(" tiny is the fastest, but the worst quality. Large-v3-turbo is the best, but slower.")
+                    model_size = gr.Dropdown(
+                        choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3", "turbo", "large-v3-turbo"],
+                        value="large-v3-turbo",
+                        label="Whisper Model Size"
+                    )
+                    language = gr.Dropdown(
+                        choices=["de", "en", "fr", "es", "it"],
+                        value="de",
+                        label="Language"
+                    )
+                process_btn = gr.Button("Process", variant="primary")
+                delete_btn = gr.Button("Delete Everything", variant="stop")
+            with gr.Column():
+                full_transcription = gr.Textbox(label="Full Transcription", lines=15)
+                segmented_transcription = gr.Textbox(label="Segmented Transcription", lines=15)
+                download_output = gr.File(label="Download Processed Files and Transcripts (ZIP)")
+        def update_silence_controls(remove_silence):
+            return {
+                min_silence_len: gr.update(visible=remove_silence),
+                silence_thresh: gr.update(visible=remove_silence),
+                full_transcription: gr.update(value=""),
+                segmented_transcription: gr.update(value=""),
+                download_output: gr.update(value=None)
+            }
+        def update_chunking_controls(enable_chunking):
+            return {
+                chunk_duration: gr.update(visible=enable_chunking),
+                ffmpeg_path: gr.update(visible=enable_chunking),
+                full_transcription: gr.update(value=""),
+                segmented_transcription: gr.update(value=""),
+                download_output: gr.update(value=None)
+            }
+        remove_silence.change(
+            fn=update_silence_controls,
+            inputs=[remove_silence],
+            outputs=[
+                min_silence_len,
+                silence_thresh,
+                full_transcription,
+                segmented_transcription,
+                download_output
+            ]
+        )
+        enable_chunking.change(
+            fn=update_chunking_controls,
+            inputs=[enable_chunking],
+            outputs=[
+                chunk_duration,
+                ffmpeg_path,
+                full_transcription,
+                segmented_transcription,
+                download_output
+            ]
+        )
+        process_btn.click(
+            fn=process_audio,
+            inputs=[
+                audio_input,
+                remove_silence,
+                min_silence_len,
+                silence_thresh,
+                enable_chunking,
+                chunk_duration,
+                ffmpeg_path,
+                model_size,
+                language,
+            ],
+            outputs=[
+                full_transcription,
+                segmented_transcription,
+                download_output,
+            ]
+        )
+        # Add cleanup function
+        def cleanup_files():
+            try:
+                # Clean up temp directories
+                temp_dirs = [d for d in os.listdir('.') if d.startswith('temp_processing')]
+                for temp_dir in temp_dirs:
+                    if os.path.exists(temp_dir):
+                        for file in os.listdir(temp_dir):
+                            file_path = os.path.join(temp_dir, file)
+                            if os.path.isfile(file_path):
+                                os.remove(file_path)
+                        os.rmdir(temp_dir)
+                # Clean up ZIP files
+                zip_files = [f for f in os.listdir('.') if f.startswith('processed_files_and_transcripts_')]
+                for zip_file in zip_files:
+                    if os.path.exists(zip_file):
+                        os.remove(zip_file)
+                # Clean up transcript files
+                transcript_files = [f for f in os.listdir('.') if f.startswith('transcript_')]
+                for transcript_file in transcript_files:
+                    if os.path.exists(transcript_file):
+                        os.remove(transcript_file)
+                # Return updates for all output fields
+                return {
+                    full_transcription: gr.update(value="All temporary files have been deleted."),
+                    segmented_transcription: gr.update(value=""),
+                    download_output: gr.update(value=None)
+                }
+            except Exception as e:
+                return {
+                    full_transcription: gr.update(value=f"Error during cleanup: {str(e)}"),
+                    segmented_transcription: gr.update(value=""),
+                    download_output: gr.update(value=None)
+                }
+        # Update the delete button click handler
+        delete_btn.click(
+            fn=cleanup_files,
+            inputs=[],
+            outputs=[
+                full_transcription,
+                segmented_transcription,
+                download_output
+            ]
+        )
+    return app
+if __name__ == "__main__":
+    app = create_interface()
+    app.launch(share=False)