Spaces:
Running
Running
Implemented transcription fallback using Whisper
Browse files- .gitattributes +1 -0
- README.md +45 -14
- app.py +72 -38
- ffmpeg_setup.py +49 -0
- logging_config.py +16 -0
- requirements.txt +4 -0
- transcription.py +96 -0
- youtube_utils.py +24 -0
.gitattributes
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
3 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
|
|
1 |
+
|
2 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
3 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
4 |
*.bin filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -1,14 +1,45 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# YouTube Transcript SmolAgent Tool
|
2 |
+
|
3 |
+
## Overview
|
4 |
+
This is a tool to fetch transcripts for YouTube videos. It attempts to retrieve the official YouTube transcript first, and if that fails or if the user opts to skip it, it falls back to using Whisper ASR to transcribe the audio.
|
5 |
+
|
6 |
+
## Project Structure
|
7 |
+
- `app.py`: Main application file that sets up the Gradio interface and handles the transcript fetching logic.
|
8 |
+
- `transcription.py`: Contains functions for downloading audio, converting it to WAV format, and transcribing it using Whisper ASR.
|
9 |
+
- `youtube_utils.py`: Contains utility functions for extracting video IDs and fetching official YouTube transcripts.
|
10 |
+
- `logging_config.py`: Configures logging for the application.
|
11 |
+
- `ffmpeg_setup.py`: Ensures that `ffmpeg` is available in the system path.
|
12 |
+
|
13 |
+
## Dependencies
|
14 |
+
- Python
|
15 |
+
- pip
|
16 |
+
- Gradio
|
17 |
+
- yt-dlp
|
18 |
+
- torch
|
19 |
+
- transformers
|
20 |
+
- youtube_transcript_api
|
21 |
+
- ffmpeg (supplied by the `imageio-ffmpeg` package)
|
22 |
+
|
23 |
+
## How It Works
|
24 |
+
1. **User Input**: The user provides a YouTube URL and optionally chooses to skip the official transcript check.
|
25 |
+
2. **Transcript Fetching**:
|
26 |
+
- If the user opts to skip the official transcript, the tool directly uses Whisper ASR to transcribe the audio.
|
27 |
+
- Otherwise, it first attempts to fetch the official YouTube transcript.
|
28 |
+
- If the official transcript is not found, it falls back to using Whisper ASR.
|
29 |
+
3. **Whisper ASR**:
|
30 |
+
- Downloads the best audio track using `yt-dlp`.
|
31 |
+
- Converts the audio to WAV format using `ffmpeg`.
|
32 |
+
- Transcribes the audio using the Whisper ASR model from the `transformers` library.
|
33 |
+
4. **Output**: The transcript and logs are displayed in the Gradio interface.
|
34 |
+
|
35 |
+
## Running the Application
|
36 |
+
1. Ensure you have all dependencies installed:
|
37 |
+
```sh
|
38 |
+
pip install gradio yt-dlp torch transformers youtube_transcript_api imageio-ffmpeg
|
39 |
+
```
|
40 |
+
2. Run the application:
|
41 |
+
```sh
|
42 |
+
python app.py
|
43 |
+
```
|
44 |
+
3. Open the provided URL in your browser to access the Gradio interface.
|
45 |
+
|
app.py
CHANGED
@@ -1,45 +1,79 @@
|
|
1 |
-
import re
|
2 |
import gradio as gr
|
3 |
-
from youtube_transcript_api import YouTubeTranscriptApi
|
4 |
-
from huggingface_hub import InferenceClient
|
5 |
|
6 |
-
|
|
|
|
|
|
|
7 |
|
8 |
-
|
9 |
-
|
10 |
-
Extract the video ID from a given YouTube URL.
|
11 |
-
"""
|
12 |
-
# Typical patterns:
|
13 |
-
# https://www.youtube.com/watch?v=VIDEO_ID
|
14 |
-
# https://youtu.be/VIDEO_ID
|
15 |
-
# https://www.youtube.com/shorts/VIDEO_ID
|
16 |
-
pattern = r"(?:v=|/shorts/|\.be/)([^&\n?#]+)"
|
17 |
-
match = re.search(pattern, youtube_url)
|
18 |
-
if not match:
|
19 |
-
raise ValueError("Could not extract video ID from the provided URL.")
|
20 |
-
return match.group(1)
|
21 |
-
|
22 |
-
def fetch_transcript(youtube_url: str) -> str:
|
23 |
"""
|
24 |
-
|
|
|
|
|
25 |
"""
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
|
44 |
-
if __name__ == "__main__":
|
45 |
demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
|
|
|
|
2 |
|
3 |
+
from logging_config import logger, log_buffer
|
4 |
+
from ffmpeg_setup import ensure_ffmpeg_in_path
|
5 |
+
from youtube_utils import fetch_youtube_transcript
|
6 |
+
from transcription import fallback_whisper_transcription
|
7 |
|
8 |
+
|
9 |
+
def get_transcript(youtube_url: str, skip_official_transcript: bool):
    """
    Stream a transcript for a YouTube video together with the running log.

    This is a generator. Every item it yields is a
    ``(partial_transcript, logs_so_far)`` tuple, where ``logs_so_far`` is the
    current content of the shared ``log_buffer``. The transcript element is
    an empty string until a transcript (or the fallback's error message) is
    available.

    The official YouTube transcript is tried first unless
    ``skip_official_transcript`` is true; if it is skipped or cannot be
    fetched, the Whisper fallback is used instead.

    Args:
        youtube_url: URL of the video to transcribe.
        skip_official_transcript: When true, go straight to the Whisper
            fallback without querying the official transcript.
    """
    # fetch_youtube_transcript() reports failure by returning a message with
    # this exact prefix. Matching the whole prefix (rather than just "Error")
    # keeps a genuine transcript that happens to begin with the word "Error"
    # from being mistaken for a failure.
    fetch_error_prefix = "Error fetching transcript:"

    # Clear logs left over from the previous request
    log_buffer.seek(0)
    log_buffer.truncate()

    logger.info(f"Received YouTube URL: {youtube_url}")
    logger.info(f"Skip official transcript check? {skip_official_transcript}")
    logger.info("")
    yield "", log_buffer.getvalue()

    # If user wants to skip the official transcript, go straight to fallback
    if skip_official_transcript:
        logger.info("User selected to skip official transcript.")
        yield "", log_buffer.getvalue()
        yield from fallback_whisper_transcription(youtube_url)
        return

    # Otherwise, try official transcript first
    transcript = fetch_youtube_transcript(youtube_url)
    logger.info("")
    yield "", log_buffer.getvalue()

    # If official transcript wasn't found, do fallback
    if transcript.startswith(fetch_error_prefix):
        logger.info("Transcript not found. Falling back to local Whisper transcription...")
        logger.info("")
        yield "", log_buffer.getvalue()

        yield from fallback_whisper_transcription(youtube_url)
        return

    # Otherwise, we succeeded with the official transcript
    logger.info("Official transcript found successfully.")
    logger.info("")
    yield transcript, log_buffer.getvalue()
|
51 |
+
|
52 |
+
|
53 |
+
def run_demo():
    """Make ffmpeg reachable, then build and launch the Gradio interface."""
    ensure_ffmpeg_in_path()

    # Input widgets, in the order get_transcript() receives them.
    url_input = gr.Textbox(label="YouTube URL")
    skip_input = gr.Checkbox(label="Skip official transcript check?", value=False)

    # Output widgets, matching the (transcript, logs) tuples that are yielded.
    transcript_output = gr.Textbox(label="Transcript")
    logs_output = gr.Textbox(label="Logs (Streaming)")

    blurb = (
        "Enter a YouTube link to retrieve its official transcript. "
        "If that fails (or if 'Skip' is selected), we'll download the best "
        "audio track with yt-dlp, convert it to WAV (via ffmpeg), and "
        "then run Whisper to transcribe. Logs are displayed as it runs."
    )

    demo = gr.Interface(
        fn=get_transcript,
        inputs=[url_input, skip_input],
        outputs=[transcript_output, logs_output],
        title="YouTube Transcript SmolAgent Tool",
        description=blurb,
    )

    demo.launch()
|
76 |
+
|
77 |
+
|
78 |
+
if __name__ == "__main__":
|
79 |
+
run_demo()
|
ffmpeg_setup.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import stat
|
3 |
+
import shutil
|
4 |
+
import subprocess
|
5 |
+
import imageio_ffmpeg
|
6 |
+
from logging_config import logger
|
7 |
+
|
8 |
+
|
9 |
+
def is_ffmpeg_in_path() -> bool:
    """
    Report whether an ``ffmpeg`` executable can be run via the current PATH.

    Runs ``ffmpeg -version`` and discards its output.

    Returns:
        True when the command starts and exits with status 0; False when it
        exits with a non-zero status or cannot be started at all.
    """
    try:
        subprocess.run(
            ["ffmpeg", "-version"],
            # The output is never inspected, so discard it instead of
            # buffering it through pipes.
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing binary (FileNotFoundError) as well as a
        # file that exists but cannot be executed (e.g. PermissionError).
        return False
    return True
|
20 |
+
|
21 |
+
|
22 |
+
def ensure_ffmpeg_in_path():
    """
    Expose the imageio-ffmpeg binary on PATH under the plain name ``ffmpeg``.

    The executable reported by ``imageio_ffmpeg.get_ffmpeg_exe()`` is copied
    next to itself as ``ffmpeg`` (unless a file of that name already exists
    there) and marked executable. Its directory is then placed at the front
    of the ``PATH`` environment variable of this process. Finally the result
    of an ``ffmpeg -version`` probe is logged; a failed probe is only logged
    as a warning, not raised.
    """
    ffmpeg_path_original = imageio_ffmpeg.get_ffmpeg_exe()
    ffmpeg_dir = os.path.dirname(ffmpeg_path_original)
    binary_name = os.path.basename(ffmpeg_path_original)

    logger.info(f"imageio-ffmpeg reported path: {ffmpeg_path_original}")
    logger.info(f"Directory contents: {os.listdir(ffmpeg_dir)}")
    logger.info(f"Binary name: {binary_name}")

    expected_binary_name = "ffmpeg"
    copied_path = os.path.join(ffmpeg_dir, expected_binary_name)

    if not os.path.exists(copied_path):
        logger.info(f"Copying {binary_name} to {expected_binary_name} in {ffmpeg_dir}.")
        shutil.copy2(ffmpeg_path_original, copied_path)
        st = os.stat(copied_path)
        os.chmod(copied_path, st.st_mode | stat.S_IEXEC)
    else:
        logger.info(f"'{copied_path}' already exists; skipping copy.")

    # Add directory to the front of PATH. PATH may be unset or empty; in that
    # case use the directory on its own, because a dangling separator would
    # introduce an empty PATH entry.
    current_path = os.environ.get("PATH", "")
    if current_path:
        os.environ["PATH"] = ffmpeg_dir + os.pathsep + current_path
    else:
        os.environ["PATH"] = ffmpeg_dir
    logger.info(f"PATH now: {os.environ['PATH']}")

    if is_ffmpeg_in_path():
        logger.info("FFmpeg is accessible on PATH.")
    else:
        logger.warning("Tried appending the directory to PATH, but 'ffmpeg' is still not found.")
|
logging_config.py
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import io
import logging

# In-memory text sink; callers read it with getvalue() to stream the logs.
log_buffer = io.StringIO()

# Handler that writes every record (DEBUG and above) into log_buffer.
log_handler = logging.StreamHandler(log_buffer)
log_handler.setLevel(logging.DEBUG)

# Application logger: accepts DEBUG and above and writes only to log_handler.
logger = logging.getLogger("my_logger")
logger.setLevel(logging.DEBUG)
# Do not pass records on to ancestor loggers, which prevents duplicate logs.
logger.propagate = False
logger.addHandler(log_handler)
|
requirements.txt
CHANGED
@@ -1,3 +1,7 @@
|
|
1 |
huggingface_hub==0.25.2
|
2 |
gradio==5.12.0
|
3 |
youtube-transcript-api==0.6.3
|
|
|
|
|
|
|
|
|
|
1 |
huggingface_hub==0.25.2
|
2 |
gradio==5.12.0
|
3 |
youtube-transcript-api==0.6.3
|
4 |
+
yt-dlp==2025.1.15
|
5 |
+
transformers==4.48.1
|
6 |
+
torch==2.2.2
|
7 |
+
imageio-ffmpeg==0.6.0
|
transcription.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import subprocess
|
3 |
+
import tempfile
|
4 |
+
import yt_dlp
|
5 |
+
import torch
|
6 |
+
from transformers import pipeline
|
7 |
+
from logging_config import logger, log_buffer
|
8 |
+
|
9 |
+
# Device string handed to the Whisper pipeline: "cuda" when torch reports
# that CUDA is available, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"
|
10 |
+
|
11 |
+
|
12 |
+
def convert_audio_to_wav(input_file: str, output_file: str) -> str:
    """
    Re-encode an audio file with ffmpeg at 16 kHz, single channel.

    Args:
        input_file: Path of the audio file to read.
        output_file: Path to write; an existing file is overwritten (``-y``).
            Callers in this project pass a ``.wav`` path.

    Returns:
        ``output_file``, unchanged.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    logger.info(f"Converting {input_file} to WAV: {output_file}")

    resample_args = ["-ar", "16000"]  # sample rate
    downmix_args = ["-ac", "1"]  # mono
    ffmpeg_command = [
        "ffmpeg",
        "-y",
        "-i", input_file,
        *resample_args,
        *downmix_args,
        output_file,
    ]

    subprocess.run(ffmpeg_command, check=True)
    return output_file
|
24 |
+
|
25 |
+
|
26 |
+
def fallback_whisper_transcription(youtube_url: str):
    """
    Transcribe a YouTube video's audio with Whisper, streaming progress.

    This is a generator. Every item it yields is a ``(transcript, logs)``
    tuple, where ``logs`` is the current content of the shared
    ``log_buffer``. The transcript element is an empty string for progress
    updates; the last item carries either the transcription text or, if any
    step raised, an ``"Error in fallback transcription: ..."`` message.

    Steps: download the audio with yt-dlp into a temporary directory,
    convert it with ``convert_audio_to_wav``, then run the
    ``openai/whisper-small`` speech-recognition pipeline on the result.
    """
    current_logs = log_buffer.getvalue

    try:
        with tempfile.TemporaryDirectory() as workdir:
            # Announce the scratch directory
            logger.info("")
            logger.info(f"Created temporary directory: {workdir}")
            logger.info("")
            yield "", current_logs()

            # Fetch the audio track
            logger.info("Downloading best audio via yt-dlp...")
            logger.info("")
            yield "", current_logs()

            # yt-dlp fills in %(ext)s with the downloaded file's extension.
            output_template = os.path.join(workdir, "audio.%(ext)s")
            downloader_options = {
                "format": "bestaudio/best",
                "outtmpl": output_template,
                "quiet": True,
                "postprocessors": [],
            }
            with yt_dlp.YoutubeDL(downloader_options) as downloader:
                downloader.download([youtube_url])

            logger.info("Audio downloaded. Locating the audio file in the temp folder...")
            logger.info("")
            yield "", current_logs()

            # The directory was empty before the download, so whatever is
            # listed first is taken to be the audio file.
            entries = os.listdir(workdir)
            if not entries:
                raise RuntimeError("No audio file was downloaded via yt-dlp.")

            source_audio = os.path.join(workdir, entries[0])
            logger.info(f"Found audio file: {source_audio}")
            logger.info("Video has downloaded!")
            logger.info("")
            yield "", current_logs()

            # Produce the WAV file
            wav_target = os.path.join(workdir, "audio.wav")
            convert_audio_to_wav(source_audio, wav_target)
            logger.info("Audio converted to WAV successfully.")
            logger.info("")
            yield "", current_logs()

            # Speech recognition
            logger.info("Running Whisper ASR pipeline on the WAV file...")
            logger.info("")
            yield "", current_logs()

            whisper = pipeline(
                "automatic-speech-recognition",
                model="openai/whisper-small",
                return_timestamps=True,
                device=device,
                generate_kwargs={"task": "transcribe", "language": "<|en|>"},
            )
            recognised = whisper(inputs=wav_target)
            text = recognised["text"]

            logger.info("Whisper transcription completed successfully.")
            logger.info("")
            yield text, current_logs()

    except Exception as exc:
        # Any failure is reported to the caller as the transcript text.
        failure = f"Error in fallback transcription: {exc!s}"
        logger.error(failure)
        yield failure, current_logs()
|
youtube_utils.py
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
3 |
+
from logging_config import logger
|
4 |
+
|
5 |
+
|
6 |
+
def get_video_id(youtube_url: str) -> str:
    """
    Extract the video ID from a YouTube URL.

    Recognised URL shapes:
        https://www.youtube.com/watch?v=VIDEO_ID
        https://youtu.be/VIDEO_ID
        https://www.youtube.com/shorts/VIDEO_ID
        https://www.youtube.com/embed/VIDEO_ID
        https://www.youtube.com/live/VIDEO_ID

    The ID ends at the first ``&``, ``?``, ``#``, ``/`` or whitespace
    character, so query parameters, a trailing slash, or stray spaces from a
    pasted URL are not included in the result.

    Args:
        youtube_url: The URL to inspect.

    Returns:
        The video ID found in the URL.

    Raises:
        ValueError: If none of the recognised shapes occurs in the URL.
    """
    pattern = r"(?:v=|/shorts/|/embed/|/live/|\.be/)([^&\s?#/]+)"
    match = re.search(pattern, youtube_url)
    if not match:
        raise ValueError("Could not extract video ID from the provided URL.")
    return match.group(1)
|
12 |
+
|
13 |
+
|
14 |
+
def fetch_youtube_transcript(youtube_url: str) -> str:
    """
    Return the official transcript of a YouTube video as one string.

    The transcript entries returned by ``YouTubeTranscriptApi.get_transcript``
    are joined with single spaces.

    Args:
        youtube_url: URL of the video.

    Returns:
        The transcript text. If anything goes wrong (including a URL from
        which no video ID can be extracted), the error is logged and a
        message starting with ``"Error fetching transcript: "`` is returned
        instead; no exception is raised.
    """
    try:
        vid = get_video_id(youtube_url)
        logger.info(f"Fetching official YouTube transcript for video ID: {vid}")
        segments = YouTubeTranscriptApi.get_transcript(vid)
        return " ".join(segment["text"] for segment in segments)
    except Exception as exc:
        # Failures are reported through the return value, not raised.
        failure = f"Error fetching transcript: {exc!s}"
        logger.error(failure)
        return failure
|