# Spaces: Running
import streamlit as st | |
import pandas as pd | |
from youtube_transcript_api import YouTubeTranscriptApi | |
from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, VideoUnavailable | |
from pytube import YouTube | |
import re | |
from typing import Dict, List, Optional, Tuple | |
import time | |
def extract_video_id(url: str) -> Optional[str]:
    """Extract the 11-character video ID from a YouTube URL.

    Recognised URL shapes (scheme and sub-domain such as ``www.`` or ``m.``
    are optional):

    * ``youtube.com/watch?v=<id>`` -- ``v`` may be any query parameter
    * ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``
      (also on ``youtube-nocookie.com``)
    * ``youtu.be/<id>``

    Args:
        url: The URL text to inspect. Surrounding whitespace is ignored.

    Returns:
        The video ID, or ``None`` when ``url`` is empty or does not look like
        a YouTube video URL.
    """
    if not url:
        return None

    # A video ID is exactly 11 URL-safe characters. The negative lookahead
    # rejects longer tokens (e.g. channel IDs) instead of truncating them.
    video_id = r'([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
    # Require a YouTube host so that arbitrary "/<11 chars>" path segments on
    # other sites (or channel/playlist pages) are not mistaken for video IDs.
    host = r'(?:^|[/.])youtube(?:-nocookie)?\.com/'
    patterns = [
        host + r'\S*?[?&]v=' + video_id,
        host + r'(?:embed|shorts|live|v)/' + video_id,
        r'(?:^|[/.])youtu\.be/' + video_id,
    ]

    candidate = url.strip()
    for pattern in patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)
    return None
def get_video_info(url: str) -> Tuple[str, str]:
    """Look up a video's title and channel name through pytube.

    This is a best-effort lookup: any failure while constructing the
    ``YouTube`` object or reading its attributes yields the placeholder
    pair ``("Unknown", "Unknown")`` instead of raising.

    Args:
        url: The video URL handed to ``pytube.YouTube``.

    Returns:
        A ``(title, author)`` tuple.
    """
    fallback = ("Unknown", "Unknown")
    try:
        video = YouTube(url)
        title, channel = video.title, video.author
    except Exception:
        # Metadata is optional for the caller; never let it abort a run.
        return fallback
    return title, channel
def get_transcript_with_retries(video_id: str, max_retries: int = 3) -> Dict:
    """Fetch a video's transcript, retrying on unexpected errors.

    English (``en``, then ``en-US``, then ``en-GB``) is preferred. When no
    English transcript exists, the first transcript the video offers is used
    instead, so videos that only have other languages still succeed.

    ``TranscriptsDisabled``, ``VideoUnavailable`` and ``NoTranscriptFound``
    are treated as permanent and are not retried. Any other exception is
    retried after a one-second pause, up to ``max_retries`` attempts in total.

    Args:
        video_id: The YouTube video ID.
        max_retries: Maximum number of attempts. Values below 1 are treated
            as 1 so the function always tries at least once.

    Returns:
        A dict with the keys ``success`` (bool), ``transcript`` (the joined
        transcript text, empty on failure), ``error`` (message or ``None``)
        and ``language`` (the transcript's language name or ``None``).
    """
    result = {
        'success': False,
        'transcript': '',
        'error': None,
        'language': None
    }
    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            try:
                # Languages are tried in the order given.
                transcript = transcript_list.find_transcript(['en', 'en-US', 'en-GB'])
            except NoTranscriptFound:
                # No English variant: fall back to whatever the video offers.
                transcript = next(iter(transcript_list), None)
                if transcript is None:
                    raise
            transcript_data = transcript.fetch()
            # Older library versions yield dicts; newer ones reportedly yield
            # snippet objects exposing ``.text`` -- accept both.
            texts = [
                entry['text'] if isinstance(entry, dict) else entry.text
                for entry in transcript_data
            ]
            result['success'] = True
            result['transcript'] = ' '.join(texts)
            result['language'] = transcript.language
            return result
        except TranscriptsDisabled:
            result['error'] = "Transcripts are disabled for this video"
            break
        except VideoUnavailable:
            result['error'] = "Video is unavailable"
            break
        except NoTranscriptFound:
            result['error'] = "No transcripts found for this video"
            break
        except Exception as e:
            if attempt == attempts - 1:
                result['error'] = f"Error: {e}"
            else:
                # Pause only when another attempt will actually follow.
                time.sleep(1)
    return result
# ---------------------------------------------------------------------------
# Streamlit page: collect URLs, extract transcripts, show and export results.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="YouTube Transcript Extractor", layout="wide")
# NOTE(review): the leading glyphs in several UI labels below look like
# mis-encoded emoji -- confirm the file is saved and served as UTF-8.
st.title('π YouTube Transcript Extractor')
st.markdown("""
This app extracts transcripts from YouTube videos. Simply paste one or more YouTube URLs below.
- Supports multiple YouTube URL formats
- Handles videos with disabled subtitles
- Provides detailed error messages
- Allows downloading results as CSV
""")

# Text area for input
urls = st.text_area('YouTube URLs (one per line)',
                    height=150,
                    help='Enter YouTube URLs, one per line',
                    placeholder="https://www.youtube.com/watch?v=...")

# Results are kept in session state so they survive the script rerun that
# Streamlit performs when another widget (e.g. the download button) is used.
RESULTS_STATE_KEY = "transcript_results"

if st.button('Extract Transcripts', type='primary'):
    # Drop results from a previous run so stale rows never sit next to a
    # fresh warning or error message.
    if RESULTS_STATE_KEY in st.session_state:
        del st.session_state[RESULTS_STATE_KEY]

    if urls:
        # Split URLs into list and clean
        url_list = [url.strip() for url in urls.split('\n') if url.strip()]
        if url_list:
            results = []
            # Progress tracking
            progress_text = "Extracting transcripts..."
            progress_bar = st.progress(0, text=progress_text)
            for i, url in enumerate(url_list):
                video_id = extract_video_id(url)
                if video_id:
                    # Get video info
                    title, author = get_video_info(url)
                    # Get transcript
                    transcript_result = get_transcript_with_retries(video_id)
                    succeeded = transcript_result['success']
                    results.append({
                        'URL': url,
                        'Video ID': video_id,
                        'Title': title,
                        'Channel': author,
                        'Status': 'Success' if succeeded else 'Error',
                        'Language': transcript_result['language'] if succeeded else None,
                        'Error': transcript_result['error'],
                        'Transcript': transcript_result['transcript'] if succeeded else None
                    })
                else:
                    results.append({
                        'URL': url,
                        'Video ID': None,
                        'Title': 'Unknown',
                        'Channel': 'Unknown',
                        'Status': 'Error',
                        'Language': None,
                        'Error': 'Invalid YouTube URL',
                        'Transcript': None
                    })
                # Update progress
                progress_bar.progress((i + 1) / len(url_list),
                                      text=f"{progress_text} ({i + 1}/{len(url_list)})")
            st.session_state[RESULTS_STATE_KEY] = results
        else:
            st.error('Please enter valid YouTube URLs')
    else:
        st.warning('Please enter at least one YouTube URL')

# Render the most recent results (if any) on every run, not only on the run
# in which the extract button was clicked.
stored_results = st.session_state.get(RESULTS_STATE_KEY)
if stored_results:
    # Create DataFrame
    df = pd.DataFrame(stored_results)
    # Display results in tabs
    tab1, tab2 = st.tabs(["π Results Overview", "π Full Transcripts"])
    with tab1:
        st.dataframe(df[['URL', 'Title', 'Channel', 'Status', 'Language', 'Error']],
                     use_container_width=True)
    with tab2:
        for row_idx, row in df.iterrows():
            if row['Status'] == 'Success':
                with st.expander(f"πΊ {row['Title']} ({row['Channel']})"):
                    # The row index keeps widget keys unique even when the
                    # same video was entered more than once.
                    st.text_area("Transcript:",
                                 value=row['Transcript'],
                                 height=200,
                                 key=f"transcript_{row_idx}_{row['Video ID']}")
    # Download button
    csv = df.to_csv(index=False)
    st.download_button(
        label="β¬οΈ Download results as CSV",
        data=csv,
        file_name="youtube_transcripts.csv",
        mime="text/csv"
    )

# Add footer with information
st.markdown("---")
st.markdown("""
<div style='text-align: center'>
<p>π‘ <i>Note: This tool works best with videos that have enabled subtitles/transcripts.</i></p>
</div>
""", unsafe_allow_html=True)