Update app.py
app.py CHANGED
@@ -11,6 +11,46 @@ from pyannote.audio import Pipeline
from huggingface_hub import HfApi
from torchaudio import functional as F # For resampling and audio processing

+# To run this Gradio demo, first ensure you have Python 3.9+ installed.
+# Then, create a virtual environment and install the required packages.
+#
+# 1. Create and activate a virtual environment (recommended):
+#    python3 -m venv venv
+#    source venv/bin/activate  # On Linux/macOS
+#    venv\Scripts\activate     # On Windows
+#
+# 2. Install the necessary packages:
+#    pip install gradio==4.20.1 \
+#        torch==2.2.1 \
+#        torchaudio==2.2.1 \
+#        transformers==4.38.2 \
+#        pyannote-audio==3.1.1 \
+#        numpy==1.26.4 \
+#        fastapi==0.110.0 \
+#        uvicorn==0.27.1 \
+#        python-multipart==0.0.9 \
+#        pydantic==2.6.1 \
+#        soundfile==0.12.1  # Required by torchaudio and pyannote for certain audio formats
+#
+# # If you have a CUDA-compatible GPU, install the CUDA version of PyTorch instead:
+# # For CUDA 12.1 (adjust 'cu121' to your CUDA version, e.g., 'cu118' for CUDA 11.8):
+# # pip install torch==2.2.1 torchaudio==2.2.1 --index-url https://download.pytorch.org/whl/cu121
+#
+# 3. Set your Hugging Face Token (required for pyannote/speaker-diarization-3.1).
+#    You must accept the user conditions on the model page: https://huggingface.co/pyannote/speaker-diarization-3.1
+#    export HF_TOKEN="hf_YOUR_TOKEN_HERE"
+#    # Or directly in the script (not recommended for security):
+#    # HF_TOKEN = "hf_YOUR_TOKEN_HERE"
+#
+# 4. Save this file as, for example, `app.py`.
+#
+# 5. Run the application using uvicorn (as requested):
+#    uvicorn app:demo --host 0.0.0.0 --port 7860
+#
+# # If you just want to run it via Python script (Gradio's default, without uvicorn directly):
+# # python app.py
+
+
# Set up logging
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
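Step 5 of the comment above points uvicorn at `app:demo`. A Gradio `Interface` is usually exposed to uvicorn by mounting it on a FastAPI app with `gr.mount_gradio_app`; a minimal sketch, not part of this commit (the `serve.py` file name and the root mount path are illustrative):

# serve.py -- illustrative sketch only; assumes `demo` is the gr.Interface built in app.py below.
import gradio as gr
from fastapi import FastAPI

from app import demo  # the Gradio Interface defined in this file

app = FastAPI()
# Mount the Gradio UI on the FastAPI app so uvicorn can serve it as ASGI:
app = gr.mount_gradio_app(app, demo, path="/")

# Run with:
#   uvicorn serve:app --host 0.0.0.0 --port 7860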
@@ -27,14 +67,14 @@ logger = logging.getLogger(__name__)
HF_TOKEN = os.getenv("HF_TOKEN")

# Model names
-ASR_MODEL = "openai/whisper-
+ASR_MODEL = "openai/whisper-large-v3-turbo" # Smaller, faster Whisper model for demo
DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"
# Speculative decoding (assistant model) is explicitly excluded as per requirements.

# --- Inference Configuration (Pydantic Model for validation) ---
class InferenceConfig(BaseModel):
    task: Literal["transcribe", "translate"] = "transcribe"
-    batch_size: int =
+    batch_size: int = 1
    chunk_length_s: int = 30
    language: Optional[str] = None
    num_speakers: Optional[int] = None
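For context on the renamed checkpoint: `predict_audio` later reads `models.get("asr_pipeline")` and `models.get("diarization_pipeline")`, but the loader itself is outside this diff. A minimal sketch of how such a `models` dict is typically populated from `ASR_MODEL` and `DIARIZATION_MODEL` (the dict keys and device handling here are assumptions):

# Illustrative sketch only -- the model-loading code is not part of this diff.
import os
import torch
from transformers import pipeline
from pyannote.audio import Pipeline

ASR_MODEL = "openai/whisper-large-v3-turbo"
DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"
HF_TOKEN = os.getenv("HF_TOKEN")

device = "cuda:0" if torch.cuda.is_available() else "cpu"

models = {
    # transformers ASR pipeline with chunked long-form decoding
    "asr_pipeline": pipeline(
        "automatic-speech-recognition",
        model=ASR_MODEL,
        device=device,
    ),
    # pyannote diarization pipeline; requires the gated model's terms to be accepted and an HF token
    "diarization_pipeline": Pipeline.from_pretrained(
        DIARIZATION_MODEL, use_auth_token=HF_TOKEN
    ),
}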
@@ -210,29 +250,46 @@ def post_process_segments_and_transcripts(combined_diarization_segments: list, a
        diar_end = diar_segment["segment"]["end"]
        speaker = diar_segment["speaker"]

-        # Find the index
-        #
-
+        # Find the index of the ASR chunk whose end timestamp is closest to diar_end
+        # Ensure argmin operates on a non-empty array
+        if current_asr_end_timestamps.size == 0:
+            logger.warning("No ASR end timestamps left to align with diarization segment. Breaking alignment.")
+            break # No more ASR chunks to align
+
        upto_idx_relative = np.argmin(np.abs(current_asr_end_timestamps - diar_end))

-        # Select the ASR chunks up to and including this `upto_idx_relative`.
        chunks_for_this_diar_segment = current_asr_chunks[:upto_idx_relative + 1]

        if not chunks_for_this_diar_segment:
-
+            logger.warning(f"No ASR chunks selected for diarization segment [{diar_start:.2f}-{diar_end:.2f}] {speaker}. Skipping.")
+            continue

-        #
-
+        # Initialize with extreme values to find min/max correctly, handling None timestamps
+        asr_min_start_val = float('inf')
+        asr_max_end_val = float('-inf')
+
+        all_text = []
+
+        for chunk in chunks_for_this_diar_segment:
+            all_text.append(chunk["text"])
+            if chunk["timestamp"] and chunk["timestamp"][0] is not None:
+                asr_min_start_val = min(asr_min_start_val, chunk["timestamp"][0])
+            if chunk["timestamp"] and chunk["timestamp"][1] is not None:
+                asr_max_end_val = max(asr_max_end_val, chunk["timestamp"][1])

-
-
-
-
+        combined_text = "".join(all_text).strip()
+
+        # If no valid timestamps were found in the selected ASR chunks, fall back to diarization segment's bounds
+        if asr_min_start_val == float('inf'):
+            logger.warning(f"No valid start timestamps in ASR chunks for segment [{diar_start:.2f}-{diar_end:.2f}] {speaker}. Using diarization start.")
+            asr_min_start_val = diar_start
+        if asr_max_end_val == float('-inf'):
+            logger.warning(f"No valid end timestamps in ASR chunks for segment [{diar_start:.2f}-{diar_end:.2f}] {speaker}. Using diarization end.")
+            asr_max_end_val = diar_end

-        #
-
-
-        final_segment_end = min(diar_end, asr_max_end)
+        # Ensure final timestamp range makes sense and is clamped by diarization segment
+        final_segment_start = max(diar_start, asr_min_start_val)
+        final_segment_end = min(diar_end, asr_max_end_val)

        final_segmented_transcript.append(
            {
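A self-contained toy run of the alignment rule in this hunk (made-up timestamps, not from the app): each diarization turn takes the ASR chunks up to the one whose end time is closest to the turn's end, then the consumed chunks are cropped away, mirroring the list slicing above.

import numpy as np

asr_chunks = [
    {"text": " hello", "timestamp": (0.0, 1.2)},
    {"text": " there", "timestamp": (1.2, 2.4)},
    {"text": " how are you", "timestamp": (2.4, 4.1)},
]
end_times = np.array([c["timestamp"][1] for c in asr_chunks])

diar_end = 2.5  # end of the current speaker turn, in seconds
upto = np.argmin(np.abs(end_times - diar_end))    # index 1: 2.4 is closest to 2.5
taken = asr_chunks[: upto + 1]                     # chunks attributed to this turn
print("".join(c["text"] for c in taken).strip())   # -> "hello there"

# Crop for the next diarization turn
asr_chunks = asr_chunks[upto + 1:]
end_times = end_times[upto + 1:]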
@@ -242,12 +299,13 @@ def post_process_segments_and_transcripts(combined_diarization_segments: list, a
            }
        )

-        #
+        # Crop the transcripts and timestamp lists according to the latest timestamp
        current_asr_chunks = current_asr_chunks[upto_idx_relative + 1:]
        current_asr_end_timestamps = current_asr_end_timestamps[upto_idx_relative + 1:]

    return final_segmented_transcript

+
def diarize_and_align_transcript(diarization_pipeline: Pipeline, original_sampling_rate: int,
                                 audio_numpy_array: np.ndarray, parameters: InferenceConfig, asr_outputs: dict) -> list:
    """
@@ -303,12 +361,12 @@ def predict_audio(
    - status_message: A message indicating success or failure.
    """
    if audio_file_tuple is None:
-        return "", "", "Please upload an audio file."
+        return "", "", gr.Warning("Please upload an audio file.")

    sampling_rate, audio_numpy_array = audio_file_tuple

    if audio_numpy_array is None or audio_numpy_array.size == 0:
-        return "", "", "Audio file is empty. Please upload a valid audio."
+        return "", "", gr.Warning("Audio file is empty. Please upload a valid audio.")

    # Ensure audio_numpy_array is float32 as expected by transformers pipeline
    if audio_numpy_array.dtype != np.float32:
@@ -318,19 +376,34 @@ def predict_audio(
    if len(audio_numpy_array.shape) > 1:
        audio_numpy_array = audio_numpy_array[:, 0]

+    # Process speaker parameters: convert 0 or negative values to None for pyannote compatibility
+    processed_num_speakers = num_speakers if num_speakers is not None and num_speakers > 0 else None
+    processed_min_speakers = min_speakers if min_speakers is not None and min_speakers > 0 else None
+    processed_max_speakers = max_speakers if max_speakers is not None and max_speakers > 0 else None
+
+    # Validation logic for min/max speakers
+    if processed_min_speakers is not None and processed_max_speakers is not None and processed_min_speakers > processed_max_speakers:
+        return "", "", gr.Warning("Diarization: Min Speakers cannot be greater than Max Speakers.")
+    if processed_num_speakers is not None:
+        if processed_min_speakers is not None and processed_num_speakers < processed_min_speakers:
+            return "", "", gr.Warning("Diarization: Number of Speakers cannot be less than Min Speakers.")
+        if processed_max_speakers is not None and processed_num_speakers > processed_max_speakers:
+            return "", "", gr.Warning("Diarization: Number of Speakers cannot be greater than Max Speakers.")
+
+
    # Create an InferenceConfig object from Gradio inputs for internal validation and use.
    try:
        parameters = InferenceConfig(
            batch_size=batch_size,
            chunk_length_s=chunk_length_s,
            language=language if language != "Auto-detect" else None, # Convert "Auto-detect" to None for model
-            num_speakers=
-            min_speakers=
-            max_speakers=
+            num_speakers=processed_num_speakers,
+            min_speakers=processed_min_speakers,
+            max_speakers=processed_max_speakers,
        )
    except Exception as e:
        logger.error(f"Error validating parameters: {e}")
-        return "", "", f"Error validating input parameters: {e}"
+        return "", "", gr.Error(f"Error validating input parameters: {e}") # Use gr.Error for critical validation failures

    logger.info(f"Inference parameters: {parameters.model_dump_json()}")
    logger.info(f"Audio sampling rate: {sampling_rate} Hz, Audio shape: {audio_numpy_array.shape}")
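The speaker-count handling added above, restated as a standalone helper with quick checks (a sketch for clarity only; the app inlines this logic and returns gr.Warning values instead of raising):

# Illustrative restatement, not part of the commit: 0, negative, or None become None
# so pyannote auto-detects the speaker count, and inconsistent combinations are rejected.
from typing import Optional, Tuple


def normalize_speaker_counts(
    num: Optional[int], mn: Optional[int], mx: Optional[int]
) -> Tuple[Optional[int], Optional[int], Optional[int]]:
    num = num if num is not None and num > 0 else None
    mn = mn if mn is not None and mn > 0 else None
    mx = mx if mx is not None and mx > 0 else None
    if mn is not None and mx is not None and mn > mx:
        raise ValueError("Min Speakers cannot be greater than Max Speakers.")
    if num is not None:
        if mn is not None and num < mn:
            raise ValueError("Number of Speakers cannot be less than Min Speakers.")
        if mx is not None and num > mx:
            raise ValueError("Number of Speakers cannot be greater than Max Speakers.")
    return num, mn, mx


assert normalize_speaker_counts(0, None, -1) == (None, None, None)
assert normalize_speaker_counts(2, 1, 4) == (2, 1, 4)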
@@ -339,7 +412,14 @@ def predict_audio(
    diarization_pipeline = models.get("diarization_pipeline")

    if not asr_pipeline:
-        return "", "", "ASR model not loaded. Please restart the application."
+        return "", "", gr.Error("ASR model not loaded. Please restart the application.")
+
+    # ASR language and batch size conflict warning/error
+    if parameters.language is None and parameters.batch_size > 1:
+        return "", "", gr.Warning(
+            "ASR: 'Auto-detect' language is not supported with batch size > 1. "
+            "Please select a specific language or set batch size to 1."
+        )

    # Prepare ASR generation arguments
    generate_kwargs = {
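The contents of `generate_kwargs` are outside this diff; with the transformers Whisper pipeline, `task` and `language` from the validated config are commonly forwarded as below. This is an assumption, shown only to make the context line concrete.

# Hypothetical sketch of the elided dict; `parameters` is the InferenceConfig
# instance validated above. Omitting "language" lets Whisper auto-detect it.
generate_kwargs = {"task": parameters.task}
if parameters.language is not None:
    generate_kwargs["language"] = parameters.language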
@@ -357,12 +437,12 @@
            batch_size=parameters.batch_size,
            generate_kwargs=generate_kwargs,
            return_timestamps=True,
-            #sampling_rate=sampling_rate # Pass original sampling rate to pipeline
+            # sampling_rate=sampling_rate # Pass original sampling rate to pipeline
        )
        logger.info("ASR inference completed.")
    except Exception as e:
        logger.error(f"ASR inference error: {str(e)}")
-        return "", "", f"ASR inference error: {str(e)}"
+        return "", "", gr.Error(f"ASR inference error: {str(e)}")

    final_transcript_data = []
    status_message = ""
@@ -426,12 +506,12 @@ demo = gr.Interface(
    fn=predict_audio,
    inputs=[
        gr.Audio(type="numpy", label="Upload Audio File (WAV, MP3, FLAC, etc.)"),
-        gr.Slider(minimum=1, maximum=32, value=
-        gr.Slider(minimum=1, maximum=
-        gr.Dropdown(WHISPER_LANGUAGES, value="
-        gr.Number(label="Diarization: Number of Speakers (optional)", value=None, precision=0, info="Expected total number of speakers."),
-        gr.Number(label="Diarization: Min Speakers (optional)", value=None, precision=0, info="Minimum number of speakers to detect."),
-        gr.Number(label="Diarization: Max Speakers (optional)", value=None, precision=0, info="Maximum number of speakers to detect.")
+        gr.Slider(minimum=1, maximum=32, value=1, step=1, label="ASR Batch Size"),
+        gr.Slider(minimum=1, maximum=30, value=30, step=1, label="ASR Chunk Length (seconds)"),
+        gr.Dropdown(WHISPER_LANGUAGES, value="Chinese", label="ASR Language"),
+        gr.Number(label="Diarization: Number of Speakers (optional)", value=None, precision=0, info="Expected total number of speakers (positive integer, or leave empty for auto-detect)."),
+        gr.Number(label="Diarization: Min Speakers (optional)", value=None, precision=0, info="Minimum number of speakers to detect (positive integer, or leave empty for auto-detect)."),
+        gr.Number(label="Diarization: Max Speakers (optional)", value=None, precision=0, info="Maximum number of speakers to detect (positive integer, or leave empty for auto-detect).")
    ],
    outputs=[
        gr.Textbox(label="Diarized Transcript", lines=10, interactive=False),
@@ -447,13 +527,15 @@ demo = gr.Interface(
        "<br><b>Note:</b> For long audios or high concurrent usage, consider using a GPU and models like `whisper-large-v3`."
    ),
    allow_flagging="never", # Disable Gradio flagging feature
-    # Example audio path assumes you are running from the cloned repository root.
-    # If not, download a small WAV file (e.g., from Common Voice) and update this path.
    examples=[
+        # Adjust this path if the `model-server/app/tests/` directory is not alongside your `app.py`
+        # For example, if app.py is in the root, and the audio is in a tests/ subdirectory,
+        # you might use: ["tests/polyai-minds14-0.wav", 24, 30, "Auto-detect", None, None, None]
        [os.path.join(os.path.dirname(__file__), "model-server", "app", "tests", "polyai-minds14-0.wav"), 24, 30, "Auto-detect", None, None, None]
    ],
-    cache_examples=False
+    cache_examples=False # Disable caching of examples to prevent InvalidPathError
)

if __name__ == "__main__":
+    logger.info("Starting Gradio demo...")
    demo.launch()
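A quick local smoke test of `predict_audio`, assuming the models loaded successfully. The argument order mirrors the gr.Interface inputs above (audio tuple, batch size, chunk length, language, num/min/max speakers); the variable names for the three outputs are illustrative, since only the first output's label appears in this diff.

# Illustrative smoke test, not part of the commit.
import numpy as np

sr = 16_000
t = np.linspace(0, 1.0, sr, endpoint=False)
tone = (0.1 * np.sin(2 * np.pi * 440.0 * t)).astype(np.float32)  # 1 s of a 440 Hz tone

# Gradio's numpy audio format is (sampling_rate, samples).
diarized_text, second_output, status = predict_audio(
    (sr, tone),
    1,                 # ASR batch size
    30,                # ASR chunk length in seconds
    "Auto-detect",     # language; allowed here because batch size is 1
    None, None, None,  # num / min / max speakers
)
print(status)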