transcribe_audio

Running

App Files Community

cstr committed on Oct 2, 2024

Commit

fa54222

verified ·

1 Parent(s): 4347dae

+v

Browse files

Files changed (1) hide show

app.py +79 -45

app.py CHANGED Viewed

@@ -45,11 +45,11 @@ logging.info(f"Using device: {device}")
 def download_audio(url, method_choice):
     """
     Downloads audio from a given URL using the specified method.
     Args:
         url (str): The URL of the audio.
         method_choice (str): The method to use for downloading audio.
     Returns:
         tuple: (path to the downloaded audio file, is_temp_file), or (error message, False).
     """
@@ -64,11 +64,14 @@ def download_audio(url, method_choice):
             audio_file = download_direct_audio(url, method_choice)
         if not audio_file or not os.path.exists(audio_file):
-            raise Exception(f"Failed to download audio from {url}")
         return audio_file, True
     except Exception as e:
-        logging.error(f"Error downloading audio: {str(e)}")
-        return f"Error: {str(e)}", False
 def download_youtube_audio(url, method_choice):
     """
@@ -114,15 +117,20 @@ def yt_dlp_method(url):
             'preferredcodec': 'mp3',
             'preferredquality': '192',
         }],
-        'quiet': True,
         'no_warnings': True,
     }
-    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-        info = ydl.extract_info(url, download=True)
-        output_file = ydl.prepare_filename(info)
-        output_file = os.path.splitext(output_file)[0] + '.mp3'
-        logging.info(f"Downloaded YouTube audio: {output_file}")
-        return output_file
 def pytube_method(url):
     """
@@ -136,15 +144,24 @@ def pytube_method(url):
     """
     logging.info("Using pytube method")
     from pytube import YouTube
-    yt = YouTube(url)
-    audio_stream = yt.streams.filter(only_audio=True).first()
-    temp_dir = tempfile.mkdtemp()
-    out_file = audio_stream.download(output_path=temp_dir)
-    base, ext = os.path.splitext(out_file)
-    new_file = base + '.mp3'
-    os.rename(out_file, new_file)
-    logging.info(f"Downloaded and converted audio to: {new_file}")
-    return new_file
 def download_rtsp_audio(url):
     """
@@ -173,11 +190,11 @@ def download_rtsp_audio(url):
 def download_direct_audio(url, method_choice):
     """
     Downloads audio from a direct URL using the specified method.
     Args:
         url (str): The direct URL of the audio file.
         method_choice (str): The method to use for downloading.
     Returns:
         str: Path to the downloaded audio file, or None if failed.
     """
@@ -191,9 +208,14 @@ def download_direct_audio(url, method_choice):
     }
     method = methods.get(method_choice, requests_method)
     try:
-        return method(url)
     except Exception as e:
-        logging.error(f"Error downloading direct audio: {str(e)}")
         return None
 def requests_method(url):
@@ -402,10 +424,10 @@ loaded_models = {}
 def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     """
-    Transcribes audio from a given URL using the specified pipeline and model.
     Args:
-        input_source (str): URL of the audio.
         pipeline_type (str): Type of pipeline to use ('faster-batched', 'faster-sequenced', or 'transformers').
         model_id (str): The ID of the model to use.
         dtype (str): Data type for model computations ('int8', 'float16', or 'float32').
@@ -430,22 +452,36 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
         if verbose:
             yield verbose_messages, "", None
-        # Input source is expected to be a URL
-        if not input_source or not input_source.strip():
-            yield "No audio URL provided.", "", None
-            return
-        # Download the audio from the URL
-        audio_path, is_temp_file = download_audio(input_source, download_method)
-        if not audio_path or audio_path.startswith("Error"):
-            yield f"Error downloading audio: {audio_path}", "", None
             return
         # Convert start_time and end_time to float or None
         start_time = float(start_time) if start_time else None
         end_time = float(end_time) if end_time else None
-        # Trim the audio if start or end times are provided
         if start_time is not None or end_time is not None:
             audio_path = trim_audio(audio_path, start_time, end_time)
             is_temp_file = True  # The trimmed audio is a temporary file
@@ -459,7 +495,6 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
             model_or_pipeline = loaded_models[model_key]
             logging.info("Loaded model from cache")
         else:
-            # Load the appropriate model or pipeline based on the pipeline type
             if pipeline_type == "faster-batched":
                 model = WhisperModel(model_id, device=device, compute_type=dtype)
                 model_or_pipeline = BatchedInferencePipeline(model=model)
@@ -489,10 +524,11 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
                     device=device,
                 )
             else:
-                raise ValueError("Invalid pipeline type")
             loaded_models[model_key] = model_or_pipeline  # Cache the model or pipeline
-        # Perform the transcription
         start_time_perf = time.time()
         if pipeline_type == "faster-batched":
             segments, info = model_or_pipeline.transcribe(audio_path, batch_size=batch_size)
@@ -503,7 +539,6 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
             segments = result["chunks"]
         end_time_perf = time.time()
-        # Calculate metrics
         transcription_time = end_time_perf - start_time_perf
         audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
@@ -515,7 +550,6 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
         if verbose:
             yield verbose_messages + metrics_output, "", None
-        # Compile the transcription text
         transcription = ""
         for segment in segments:
@@ -527,13 +561,13 @@ def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, d
             if verbose:
                 yield verbose_messages + metrics_output, transcription, None
-        # Save the transcription to a file
         transcription_file = save_transcription(transcription)
         yield verbose_messages + metrics_output, transcription, transcription_file
     except Exception as e:
-        logging.error(f"An error occurred during transcription: {str(e)}")
-        yield f"An error occurred: {str(e)}", "", None
     finally:
         # Clean up temporary audio files

 def download_audio(url, method_choice):
     """
     Downloads audio from a given URL using the specified method.
     Args:
         url (str): The URL of the audio.
         method_choice (str): The method to use for downloading audio.
     Returns:
         tuple: (path to the downloaded audio file, is_temp_file), or (error message, False).
     """
             audio_file = download_direct_audio(url, method_choice)
         if not audio_file or not os.path.exists(audio_file):
+            error_msg = f"Failed to download audio from {url} using method {method_choice}"
+            logging.error(error_msg)
+            return error_msg, False
         return audio_file, True
     except Exception as e:
+        error_msg = f"Error downloading audio from {url} using method {method_choice}: {str(e)}"
+        logging.error(error_msg)
+        return error_msg, False
 def download_youtube_audio(url, method_choice):
     """
             'preferredcodec': 'mp3',
             'preferredquality': '192',
         }],
+        'quiet': False,
         'no_warnings': True,
+        'logger': logging.getLogger(),  # Capture yt-dlp logs
     }
+    try:
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(url, download=True)
+            output_file = ydl.prepare_filename(info)
+            output_file = os.path.splitext(output_file)[0] + '.mp3'
+            logging.info(f"Downloaded YouTube audio: {output_file}")
+            return output_file
+    except Exception as e:
+        logging.error(f"Error in yt_dlp_method: {str(e)}")
+        raise Exception(f"yt-dlp failed to download audio: {str(e)}")
 def pytube_method(url):
     """
     """
     logging.info("Using pytube method")
     from pytube import YouTube
+    try:
+        yt = YouTube(url)
+        audio_stream = yt.streams.filter(only_audio=True).first()
+        if audio_stream is None:
+            error_msg = "No audio streams available with pytube."
+            logging.error(error_msg)
+            raise Exception(error_msg)
+        temp_dir = tempfile.mkdtemp()
+        out_file = audio_stream.download(output_path=temp_dir)
+        base, ext = os.path.splitext(out_file)
+        new_file = base + '.mp3'
+        os.rename(out_file, new_file)
+        logging.info(f"Downloaded and converted audio to: {new_file}")
+        return new_file
+    except Exception as e:
+        logging.error(f"Error in pytube_method: {str(e)}")
+        raise Exception(f"pytube failed to download audio: {str(e)}")
 def download_rtsp_audio(url):
     """
 def download_direct_audio(url, method_choice):
     """
     Downloads audio from a direct URL using the specified method.
     Args:
         url (str): The direct URL of the audio file.
         method_choice (str): The method to use for downloading.
     Returns:
         str: Path to the downloaded audio file, or None if failed.
     """
     }
     method = methods.get(method_choice, requests_method)
     try:
+        audio_file = method(url)
+        if not audio_file or not os.path.exists(audio_file):
+            error_msg = f"Failed to download direct audio from {url} using method {method_choice}"
+            logging.error(error_msg)
+            return None
+        return audio_file
     except Exception as e:
+        logging.error(f"Error downloading direct audio with {method_choice}: {str(e)}")
         return None
 def requests_method(url):
 def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     """
+    Transcribes audio from a given source using the specified pipeline and model.
     Args:
+        input_source (str or file): URL of audio, path to local file, or uploaded file object.
         pipeline_type (str): Type of pipeline to use ('faster-batched', 'faster-sequenced', or 'transformers').
         model_id (str): The ID of the model to use.
         dtype (str): Data type for model computations ('int8', 'float16', or 'float32').
         if verbose:
             yield verbose_messages, "", None
+        # Determine if input_source is a URL or file
+        audio_path = None
+        is_temp_file = False
+        if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
+            # Input source is a URL
+            audio_path, is_temp_file = download_audio(input_source, download_method)
+            if not audio_path or audio_path.startswith("Error"):
+                error_msg = f"Error downloading audio: {audio_path}"
+                logging.error(error_msg)
+                yield error_msg, "", None
+                return
+        elif isinstance(input_source, str) and os.path.exists(input_source):
+            # Input source is a local file path
+            audio_path = input_source
+            is_temp_file = False
+        elif hasattr(input_source, 'name'):
+            # Input source is an uploaded file object
+            audio_path = input_source.name
+            is_temp_file = False
+        else:
+            error_msg = "No valid audio source provided."
+            logging.error(error_msg)
+            yield error_msg, "", None
             return
         # Convert start_time and end_time to float or None
         start_time = float(start_time) if start_time else None
         end_time = float(end_time) if end_time else None
         if start_time is not None or end_time is not None:
             audio_path = trim_audio(audio_path, start_time, end_time)
             is_temp_file = True  # The trimmed audio is a temporary file
             model_or_pipeline = loaded_models[model_key]
             logging.info("Loaded model from cache")
         else:
             if pipeline_type == "faster-batched":
                 model = WhisperModel(model_id, device=device, compute_type=dtype)
                 model_or_pipeline = BatchedInferencePipeline(model=model)
                     device=device,
                 )
             else:
+                error_msg = "Invalid pipeline type"
+                logging.error(error_msg)
+                raise ValueError(error_msg)
             loaded_models[model_key] = model_or_pipeline  # Cache the model or pipeline
         start_time_perf = time.time()
         if pipeline_type == "faster-batched":
             segments, info = model_or_pipeline.transcribe(audio_path, batch_size=batch_size)
             segments = result["chunks"]
         end_time_perf = time.time()
         transcription_time = end_time_perf - start_time_perf
         audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
         if verbose:
             yield verbose_messages + metrics_output, "", None
         transcription = ""
         for segment in segments:
             if verbose:
                 yield verbose_messages + metrics_output, transcription, None
         transcription_file = save_transcription(transcription)
         yield verbose_messages + metrics_output, transcription, transcription_file
     except Exception as e:
+        error_msg = f"An error occurred during transcription: {str(e)}"
+        logging.error(error_msg)
+        yield error_msg, "", None
     finally:
         # Clean up temporary audio files