maguid28 committed on
Commit
4f48868
·
1 Parent(s): bcae73d

Implemented transcription fallback using Whisper

Browse files
Files changed (8) hide show
  1. .gitattributes +1 -0
  2. README.md +45 -14
  3. app.py +72 -38
  4. ffmpeg_setup.py +49 -0
  5. logging_config.py +16 -0
  6. requirements.txt +4 -0
  7. transcription.py +96 -0
  8. youtube_utils.py +24 -0
.gitattributes CHANGED
@@ -1,3 +1,4 @@
 
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
1
+
2
  *.7z filter=lfs diff=lfs merge=lfs -text
3
  *.arrow filter=lfs diff=lfs merge=lfs -text
4
  *.bin filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,45 @@
1
- ---
2
- title: VideoTranscription
3
- emoji: 💬
4
- colorFrom: yellow
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.0.1
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- short_description: Transcribe video
12
- ---
13
-
14
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # YouTube Transcript SmolAgent Tool
2
+
3
+ ## Overview
4
+ This is a tool to fetch transcripts for YouTube videos. It attempts to retrieve the official YouTube transcript first, and if that fails or if the user opts to skip it, it falls back to using Whisper ASR to transcribe the audio.
5
+
6
+ ## Project Structure
7
+ - `app.py`: Main application file that sets up the Gradio interface and handles the transcript fetching logic.
8
+ - `transcription.py`: Contains functions for downloading audio, converting it to WAV format, and transcribing it using Whisper ASR.
9
+ - `youtube_utils.py`: Contains utility functions for extracting video IDs and fetching official YouTube transcripts.
10
+ - `logging_config.py`: Configures logging for the application.
11
+ - `ffmpeg_setup.py`: Ensures that `ffmpeg` is available in the system path.
12
+
13
+ ## Dependencies
14
+ - Python
15
+ - pip
16
+ - Gradio
17
+ - yt-dlp
18
+ - torch
19
+ - transformers
20
+ - youtube_transcript_api
21
+ - ffmpeg (supplied by the `imageio-ffmpeg` package)
22
+
23
+ ## How It Works
24
+ 1. **User Input**: The user provides a YouTube URL and optionally chooses to skip the official transcript check.
25
+ 2. **Transcript Fetching**:
26
+ - If the user opts to skip the official transcript, the tool directly uses Whisper ASR to transcribe the audio.
27
+ - Otherwise, it first attempts to fetch the official YouTube transcript.
28
+ - If the official transcript is not found, it falls back to using Whisper ASR.
29
+ 3. **Whisper ASR**:
30
+ - Downloads the best audio track using `yt-dlp`.
31
+ - Converts the audio to WAV format using `ffmpeg`.
32
+ - Transcribes the audio using the Whisper ASR model from the `transformers` library.
33
+ 4. **Output**: The transcript and logs are displayed in the Gradio interface.
34
+
35
+ ## Running the Application
36
+ 1. Ensure you have all dependencies installed:
37
+ ```sh
38
+ pip install gradio yt-dlp torch transformers youtube_transcript_api imageio-ffmpeg
39
+ ```
40
+ 2. Run the application:
41
+ ```sh
42
+ python app.py
43
+ ```
44
+ 3. Open the provided URL in your browser to access the Gradio interface.
45
+
app.py CHANGED
@@ -1,45 +1,79 @@
1
- import re
2
  import gradio as gr
3
- from youtube_transcript_api import YouTubeTranscriptApi
4
- from huggingface_hub import InferenceClient
5
 
6
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
 
 
7
 
8
- def get_video_id(youtube_url: str) -> str:
9
- """
10
- Extract the video ID from a given YouTube URL.
11
- """
12
- # Typical patterns:
13
- # https://www.youtube.com/watch?v=VIDEO_ID
14
- # https://youtu.be/VIDEO_ID
15
- # https://www.youtube.com/shorts/VIDEO_ID
16
- pattern = r"(?:v=|/shorts/|\.be/)([^&\n?#]+)"
17
- match = re.search(pattern, youtube_url)
18
- if not match:
19
- raise ValueError("Could not extract video ID from the provided URL.")
20
- return match.group(1)
21
-
22
- def fetch_transcript(youtube_url: str) -> str:
23
  """
24
- Given a YouTube URL, fetch the transcript and return it as a single string.
 
 
25
  """
26
- try:
27
- video_id = get_video_id(youtube_url)
28
- # Fetch transcript
29
- transcript_data = YouTubeTranscriptApi.get_transcript(video_id)
30
- # Combine the transcript lines
31
- transcript = " ".join([entry["text"] for entry in transcript_data])
32
- return transcript
33
- except Exception as e:
34
- return f"Error fetching transcript: {str(e)}"
35
-
36
- demo = gr.Interface(
37
- fn=fetch_transcript,
38
- inputs=gr.Textbox(label="YouTube URL"),
39
- outputs="text",
40
- title="YouTube Transcript Fetcher",
41
- description="Enter a YouTube link to retrieve its transcript."
42
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- if __name__ == "__main__":
45
  demo.launch()
 
 
 
 
 
 
1
  import gradio as gr
 
 
2
 
3
+ from logging_config import logger, log_buffer
4
+ from ffmpeg_setup import ensure_ffmpeg_in_path
5
+ from youtube_utils import fetch_youtube_transcript
6
+ from transcription import fallback_whisper_transcription
7
 
8
+
9
def get_transcript(youtube_url: str, skip_official_transcript: bool):
    """
    Stream a transcript for a YouTube video together with the running log.

    This is a generator: every yielded pair is pushed to the two Gradio
    output boxes, so the log box updates while the work is in progress.

    Args:
        youtube_url: Link to the video to transcribe.
        skip_official_transcript: When True, do not query the official
            YouTube transcript and go straight to Whisper.

    Yields:
        ``(transcript_so_far, logs_so_far)`` tuples. The transcript element
        stays empty until a final transcript (or an error message) is known.
    """
    # Clear logs left over from the previous request
    log_buffer.seek(0)
    log_buffer.truncate()

    logger.info(f"Received YouTube URL: {youtube_url}")
    logger.info(f"Skip official transcript check? {skip_official_transcript}")
    logger.info("")
    yield "", log_buffer.getvalue()

    # A blank URL can never succeed; fail fast instead of running the
    # whole download / transcription fallback just to report an error.
    if not youtube_url or not youtube_url.strip():
        err_msg = "Error: no YouTube URL was provided."
        logger.error(err_msg)
        yield err_msg, log_buffer.getvalue()
        return

    # If user wants to skip the official transcript, go straight to fallback
    if skip_official_transcript:
        logger.info("User selected to skip official transcript.")
        yield "", log_buffer.getvalue()
        yield from fallback_whisper_transcription(youtube_url)
        return

    # Otherwise, try official transcript first
    transcript = fetch_youtube_transcript(youtube_url)
    logger.info("")
    yield "", log_buffer.getvalue()

    # fetch_youtube_transcript signals failure by returning a message with
    # this exact prefix. Matching the whole prefix (not just "Error") keeps
    # genuine transcripts that happen to start with the word "Error" from
    # being mistaken for a failure.
    if transcript.startswith("Error fetching transcript:"):
        logger.info("Transcript not found. Falling back to local Whisper transcription...")
        logger.info("")
        yield "", log_buffer.getvalue()
        yield from fallback_whisper_transcription(youtube_url)
        return

    # Otherwise, we succeeded with the official transcript
    logger.info("Official transcript found successfully.")
    logger.info("")
    yield transcript, log_buffer.getvalue()
51
+
52
+
53
def run_demo():
    """Prepare ffmpeg, assemble the Gradio interface and launch it."""
    # Make ffmpeg resolvable up front, before the UI starts serving requests
    ensure_ffmpeg_in_path()

    # Input widgets
    url_box = gr.Textbox(label="YouTube URL")
    skip_box = gr.Checkbox(label="Skip official transcript check?", value=False)

    # Output widgets: get_transcript yields (transcript, logs) pairs
    transcript_box = gr.Textbox(label="Transcript")
    logs_box = gr.Textbox(label="Logs (Streaming)")

    blurb = (
        "Enter a YouTube link to retrieve its official transcript. "
        "If that fails (or if 'Skip' is selected), we'll download the best "
        "audio track with yt-dlp, convert it to WAV (via ffmpeg), and "
        "then run Whisper to transcribe. Logs are displayed as it runs."
    )

    demo = gr.Interface(
        fn=get_transcript,
        inputs=[url_box, skip_box],
        outputs=[transcript_box, logs_box],
        title="YouTube Transcript SmolAgent Tool",
        description=blurb,
    )
    demo.launch()


if __name__ == "__main__":
    run_demo()
ffmpeg_setup.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import stat
3
+ import shutil
4
+ import subprocess
5
+ import imageio_ffmpeg
6
+ from logging_config import logger
7
+
8
+
9
def is_ffmpeg_in_path() -> bool:
    """
    Report whether ``ffmpeg`` can be launched through the current PATH.

    Returns:
        True when ``ffmpeg -version`` starts and exits with status 0,
        False when the executable is missing, cannot be executed, or
        exits with a non-zero status.
    """
    try:
        subprocess.run(
            ["ffmpeg", "-version"],
            # The output is never inspected, so discard it instead of
            # buffering it through pipes.
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers FileNotFoundError (no such binary) as well as
        # PermissionError (a file named ffmpeg exists but is not executable).
        return False
    return True
20
+
21
+
22
def ensure_ffmpeg_in_path():
    """
    Make the ffmpeg binary reported by imageio-ffmpeg callable as ``ffmpeg``.

    The binary path returned by ``imageio_ffmpeg.get_ffmpeg_exe()`` is copied
    next to itself under the plain name ``ffmpeg`` (``ffmpeg.exe`` when the
    original ends in ``.exe``), and its directory is placed at the front of
    ``PATH``. The outcome is logged; nothing is returned.

    When imageio-ffmpeg reports a bare command name (no directory part),
    there is nothing to copy and PATH is left untouched.
    """
    ffmpeg_path_original = imageio_ffmpeg.get_ffmpeg_exe()
    ffmpeg_dir = os.path.dirname(ffmpeg_path_original)
    binary_name = os.path.basename(ffmpeg_path_original)

    logger.info(f"imageio-ffmpeg reported path: {ffmpeg_path_original}")

    if ffmpeg_dir:
        logger.info(f"Directory contents: {os.listdir(ffmpeg_dir)}")
        logger.info(f"Binary name: {binary_name}")

        # Keep the .exe suffix so the copy stays launchable on Windows.
        exe_suffix = ".exe" if binary_name.lower().endswith(".exe") else ""
        expected_binary_name = "ffmpeg" + exe_suffix
        copied_path = os.path.join(ffmpeg_dir, expected_binary_name)

        if not os.path.exists(copied_path):
            logger.info(f"Copying {binary_name} to {expected_binary_name} in {ffmpeg_dir}.")
            shutil.copy2(ffmpeg_path_original, copied_path)
            st = os.stat(copied_path)
            os.chmod(copied_path, st.st_mode | stat.S_IEXEC)
        else:
            logger.info(f"'{copied_path}' already exists; skipping copy.")

        # Put the directory first on PATH. PATH may be unset, and repeated
        # calls must not pile up duplicate entries.
        current_path = os.environ.get("PATH", "")
        other_entries = [
            entry
            for entry in current_path.split(os.pathsep)
            if entry and entry != ffmpeg_dir
        ]
        os.environ["PATH"] = os.pathsep.join([ffmpeg_dir, *other_entries])
        logger.info(f"PATH now: {os.environ['PATH']}")
    else:
        # A bare name has no directory to list, copy into, or add to PATH.
        logger.info(
            f"imageio-ffmpeg returned the bare command '{binary_name}'; "
            "skipping copy and PATH update."
        )

    if is_ffmpeg_in_path():
        logger.info("FFmpeg is accessible on PATH.")
    else:
        logger.warning("Tried prepending the directory to PATH, but 'ffmpeg' is still not found.")
logging_config.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import io
3
+
4
# In-memory text sink: everything logged through `logger` accumulates here
log_buffer = io.StringIO()

# Handler that writes every record (DEBUG and above) into the buffer
log_handler = logging.StreamHandler(log_buffer)
log_handler.setLevel(logging.DEBUG)

logger = logging.getLogger("my_logger")
logger.setLevel(logging.DEBUG)
# Do not hand records on to ancestor loggers, so they are not emitted twice
logger.propagate = False
logger.addHandler(log_handler)
requirements.txt CHANGED
@@ -1,3 +1,7 @@
1
  huggingface_hub==0.25.2
2
  gradio==5.12.0
3
  youtube-transcript-api==0.6.3
 
 
 
 
 
1
  huggingface_hub==0.25.2
2
  gradio==5.12.0
3
  youtube-transcript-api==0.6.3
4
+ yt-dlp==2025.1.15
5
+ transformers==4.48.1
6
+ torch==2.2.2
7
+ imageio-ffmpeg==0.6.0
transcription.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import subprocess
3
+ import tempfile
4
+ import yt_dlp
5
+ import torch
6
+ from transformers import pipeline
7
+ from logging_config import logger, log_buffer
8
+
9
# Use the GPU when CUDA is available, otherwise stay on the CPU
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
10
+
11
+
12
def convert_audio_to_wav(input_file: str, output_file: str) -> str:
    """
    Convert an audio file to 16 kHz mono WAV with ffmpeg.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path of the WAV file to write (overwritten if present).

    Returns:
        ``output_file``, unchanged.

    Raises:
        ValueError: If both paths refer to the same file; ffmpeg cannot
            convert a file onto itself.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
            status. Its stderr output is logged before re-raising.
    """
    if os.path.abspath(input_file) == os.path.abspath(output_file):
        raise ValueError(
            f"Input and output paths are the same file: {input_file}"
        )

    logger.info(f"Converting {input_file} to WAV: {output_file}")
    cmd = [
        "ffmpeg",
        "-nostdin",            # never wait on / consume the server's stdin
        "-hide_banner",
        "-loglevel", "error",  # keep captured stderr down to real errors
        "-y",
        "-i", input_file,
        "-ar", "16000",        # sample rate
        "-ac", "1",            # mono
        output_file,
    ]
    try:
        subprocess.run(
            cmd,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
            errors="replace",
        )
    except subprocess.CalledProcessError as exc:
        # Surface ffmpeg's own diagnostics in the log instead of only the
        # bare exit status carried by the exception.
        details = (exc.stderr or "").strip()
        logger.error(f"ffmpeg failed with exit code {exc.returncode}: {details}")
        raise
    return output_file
24
+
25
+
26
# Whisper pipeline shared by all calls in this process; built on first use.
_ASR_PIPELINE = None


def _get_asr_pipeline():
    """Return the Whisper ASR pipeline, constructing it only once."""
    global _ASR_PIPELINE
    if _ASR_PIPELINE is None:
        _ASR_PIPELINE = pipeline(
            "automatic-speech-recognition",
            model="openai/whisper-small",
            return_timestamps=True,
            device=device,
            generate_kwargs={"task": "transcribe", "language": "<|en|>"},
        )
    return _ASR_PIPELINE


def _find_downloaded_audio(directory: str) -> str:
    """Return the path of the finished ``audio.<ext>`` download in *directory*."""
    candidates = sorted(
        name
        for name in os.listdir(directory)
        # Ignore partial-download leftovers that share the directory.
        if name.startswith("audio.") and not name.endswith((".part", ".ytdl"))
    )
    if not candidates:
        raise RuntimeError("No audio file was downloaded via yt-dlp.")
    return os.path.join(directory, candidates[0])


def fallback_whisper_transcription(youtube_url: str):
    """
    Transcribe a YouTube video locally: download audio, convert, run Whisper.

    This is a generator so that progress can be streamed while it runs.

    Args:
        youtube_url: Link to the video whose audio should be transcribed.

    Yields:
        ``(transcript, logs)`` tuples. ``transcript`` is an empty string for
        progress updates and holds the final text on the last yield. If any
        step fails, the last yield carries an
        ``"Error in fallback transcription: ..."`` message instead.
    """
    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            # Create temp dir
            logger.info("")
            logger.info(f"Created temporary directory: {tmpdir}")
            logger.info("")
            yield "", log_buffer.getvalue()

            # Download best audio
            logger.info("Downloading best audio via yt-dlp...")
            logger.info("")
            yield "", log_buffer.getvalue()

            download_path = os.path.join(tmpdir, "audio.%(ext)s")
            ydl_opts = {
                'format': 'bestaudio/best',
                'outtmpl': download_path,
                'quiet': True,
                'postprocessors': []
            }

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([youtube_url])

            logger.info("Audio downloaded. Locating the audio file in the temp folder...")
            logger.info("")
            yield "", log_buffer.getvalue()

            # confirm audio file
            audio_file_path = _find_downloaded_audio(tmpdir)
            logger.info(f"Found audio file: {audio_file_path}")
            logger.info("Video has downloaded!")
            logger.info("")
            yield "", log_buffer.getvalue()

            # Convert to wav. The target name deliberately differs from the
            # "audio.<ext>" download pattern, so a download that is already
            # "audio.wav" is never used as both ffmpeg input and output.
            wav_file_path = os.path.join(tmpdir, "audio_16k_mono.wav")
            convert_audio_to_wav(audio_file_path, wav_file_path)
            logger.info("Audio converted to WAV successfully.")
            logger.info("")
            yield "", log_buffer.getvalue()

            # Run whisper
            logger.info("Running Whisper ASR pipeline on the WAV file...")
            logger.info("")
            yield "", log_buffer.getvalue()

            # Reuse the already-loaded model instead of rebuilding the
            # pipeline for every request.
            asr_pipeline = _get_asr_pipeline()
            result = asr_pipeline(inputs=wav_file_path)
            transcription = result["text"]

            logger.info("Whisper transcription completed successfully.")
            logger.info("")
            yield transcription, log_buffer.getvalue()

    except Exception as e:
        # Boundary handler: report the failure to the UI instead of raising.
        err_msg = f"Error in fallback transcription: {str(e)}"
        logger.error(err_msg)
        yield err_msg, log_buffer.getvalue()
youtube_utils.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from youtube_transcript_api import YouTubeTranscriptApi
3
+ from logging_config import logger
4
+
5
+
6
# Recognised URL shapes:
#   https://www.youtube.com/watch?v=VIDEO_ID   (v= anywhere in the query)
#   https://youtu.be/VIDEO_ID
#   https://www.youtube.com/shorts/VIDEO_ID
#   https://www.youtube.com/embed/VIDEO_ID
#   https://www.youtube.com/live/VIDEO_ID
#   https://www.youtube.com/v/VIDEO_ID
# "\bv=" keeps parameters that merely end in "v" (e.g. "rev=") from matching.
_VIDEO_ID_RE = re.compile(
    r"(?:\bv=|/shorts/|/embed/|/live/|/v/|\.be/)([A-Za-z0-9_-]+)"
)


def get_video_id(youtube_url: str) -> str:
    """
    Extract the video ID from a YouTube URL.

    Args:
        youtube_url: A watch, youtu.be, shorts, embed, live or /v/ link.
            Surrounding whitespace is ignored.

    Returns:
        The video ID, without query parameters, fragments or trailing
        slashes.

    Raises:
        ValueError: If the input is empty, not a string, or contains no
            recognisable video ID.
    """
    if not isinstance(youtube_url, str) or not youtube_url.strip():
        raise ValueError("Could not extract video ID from the provided URL.")

    match = _VIDEO_ID_RE.search(youtube_url.strip())
    if match is None:
        raise ValueError("Could not extract video ID from the provided URL.")
    return match.group(1)
12
+
13
+
14
def fetch_youtube_transcript(youtube_url: str) -> str:
    """
    Fetch the official transcript of a YouTube video as one string.

    Args:
        youtube_url: Link to the video.

    Returns:
        The transcript segments joined by single spaces. On any failure the
        function does not raise; it logs the problem and returns a message
        starting with ``"Error fetching transcript: "`` instead.
    """
    try:
        vid = get_video_id(youtube_url)
        logger.info(f"Fetching official YouTube transcript for video ID: {vid}")
        pieces = []
        for segment in YouTubeTranscriptApi.get_transcript(vid):
            pieces.append(segment["text"])
        return " ".join(pieces)
    except Exception as exc:
        # Failures are reported through the return value, not by raising
        failure = f"Error fetching transcript: {str(exc)}"
        logger.error(failure)
        return failure