# YouTube Transcript Extractor (Streamlit app).
# Page-capture metadata: status "Running"; file size 6,858 bytes; revision 7aea2f1.
import streamlit as st
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, VideoUnavailable
from pytube import YouTube
import re
from typing import Dict, List, Optional, Tuple
import time
def extract_video_id(url: str) -> Optional[str]:
    """Extract the 11-character YouTube video ID from a URL.

    The ID is recognised when it directly follows ``v=`` (watch URLs) or a
    ``/`` (``youtu.be/<id>``, ``/embed/<id>``, ``/shorts/<id>`` and similar).

    Args:
        url: The URL (or URL fragment) to inspect.

    Returns:
        The video ID, or ``None`` when no ID-shaped token is found.
    """
    # The negative lookahead rejects tokens longer than 11 characters, so a
    # longer path segment (for example the 24-character ID after
    # ``/channel/``) is not truncated into a bogus video ID.
    match = re.search(r'(?:v=|/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])', url)
    return match.group(1) if match else None
def get_video_info(url: str) -> Tuple[str, str]:
    """Look up a video's title and channel name via pytube.

    Best-effort: any failure while creating the ``YouTube`` object or
    reading its ``title`` / ``author`` yields ``("Unknown", "Unknown")``
    instead of raising.
    """
    fallback = ("Unknown", "Unknown")
    try:
        video = YouTube(url)
        # Attribute reads stay inside the try so their failures are covered too.
        title = video.title
        author = video.author
    except Exception:
        return fallback
    return title, author
def get_transcript_with_retries(video_id: str, max_retries: int = 3) -> Dict:
    """Fetch a video's transcript, retrying on unexpected errors.

    English transcripts (``en``, then ``en-US``, then ``en-GB``) are
    preferred; when none exists, the first transcript the video offers is
    used instead.

    Args:
        video_id: YouTube video ID.
        max_retries: Maximum number of attempts. Only unexpected errors are
            retried; disabled transcripts, an unavailable video, or a video
            without any transcript end the loop immediately.

    Returns:
        A dict with the keys ``success`` (bool), ``transcript`` (the joined
        transcript text, ``''`` on failure), ``error`` (a message, or
        ``None``) and ``language`` (the chosen transcript's ``language``
        attribute, or ``None``).
    """
    result = {
        'success': False,
        'transcript': '',
        'error': None,
        'language': None
    }

    for attempt in range(max_retries):
        try:
            # First get the list of available transcripts
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

            try:
                # Language codes are tried in the order given
                transcript = transcript_list.find_transcript(['en', 'en-US', 'en-GB'])
            except NoTranscriptFound:
                # No English variant: fall back to the first available
                # transcript in any language.
                transcript = next(iter(transcript_list), None)
                if transcript is None:
                    # Nothing to fall back to; let the outer handler report it
                    raise

            # NOTE(review): assumes fetch() yields entries subscriptable by
            # 'text' — confirm against the installed library version.
            transcript_data = transcript.fetch()

            result['success'] = True
            result['transcript'] = ' '.join(entry['text'] for entry in transcript_data)
            result['language'] = transcript.language
            return result

        # These conditions will not change on a retry, so stop immediately
        except TranscriptsDisabled:
            result['error'] = "Transcripts are disabled for this video"
            break
        except VideoUnavailable:
            result['error'] = "Video is unavailable"
            break
        except NoTranscriptFound:
            result['error'] = "No transcripts found for this video"
            break
        except Exception as e:
            if attempt == max_retries - 1:
                result['error'] = f"Error: {str(e)}"
            else:
                # Wait before the next attempt; no point sleeping after the last one
                time.sleep(1)

    return result
# Page setup and introductory text
st.set_page_config(page_title="YouTube Transcript Extractor", layout="wide")
st.title('📝 YouTube Transcript Extractor')
st.markdown("""
This app extracts transcripts from YouTube videos. Simply paste one or more YouTube URLs below.
- Supports multiple YouTube URL formats
- Handles videos with disabled subtitles
- Provides detailed error messages
- Allows downloading results as CSV
""")

# Text area for input; the raw text is split into URLs when the button is pressed
urls = st.text_area(
    'YouTube URLs (one per line)',
    height=150,
    help='Enter YouTube URLs, one per line',
    placeholder="https://www.youtube.com/watch?v=...",
)
def _build_result_row(url: str) -> Dict:
    """Return the results-table row for one URL (ID, metadata, transcript)."""
    video_id = extract_video_id(url)
    if not video_id:
        return {
            'URL': url,
            'Video ID': None,
            'Title': 'Unknown',
            'Channel': 'Unknown',
            'Status': 'Error',
            'Language': None,
            'Error': 'Invalid YouTube URL',
            'Transcript': None,
        }

    title, author = get_video_info(url)
    transcript_result = get_transcript_with_retries(video_id)
    succeeded = transcript_result['success']
    return {
        'URL': url,
        'Video ID': video_id,
        'Title': title,
        'Channel': author,
        'Status': 'Success' if succeeded else 'Error',
        'Language': transcript_result['language'] if succeeded else None,
        'Error': transcript_result['error'],
        'Transcript': transcript_result['transcript'] if succeeded else None,
    }


if st.button('Extract Transcripts', type='primary'):
    if urls:
        # Split the input into stripped, non-empty lines
        url_list = [line.strip() for line in urls.split('\n') if line.strip()]

        if url_list:
            results = []

            # Progress tracking
            progress_text = "Extracting transcripts..."
            progress_bar = st.progress(0, text=progress_text)
            total = len(url_list)

            for done, url in enumerate(url_list, start=1):
                results.append(_build_result_row(url))
                progress_bar.progress(done / total,
                                      text=f"{progress_text} ({done}/{total})")

            df = pd.DataFrame(results)

            # Display results in tabs
            tab1, tab2 = st.tabs(["📊 Results Overview", "📝 Full Transcripts"])

            with tab1:
                st.dataframe(df[['URL', 'Title', 'Channel', 'Status', 'Language', 'Error']],
                             use_container_width=True)

            with tab2:
                for row_index, row in df.iterrows():
                    if row['Status'] == 'Success':
                        with st.expander(f"📺 {row['Title']} ({row['Channel']})"):
                            # The row index keeps the key unique when the same
                            # video is listed more than once; Streamlit rejects
                            # duplicate widget keys.
                            st.text_area("Transcript:",
                                         value=row['Transcript'],
                                         height=200,
                                         key=f"transcript_{row_index}_{row['Video ID']}")

            # Download button
            if not df.empty:
                csv = df.to_csv(index=False)
                st.download_button(
                    label="⬇️ Download results as CSV",
                    data=csv,
                    file_name="youtube_transcripts.csv",
                    mime="text/csv"
                )
        else:
            # The input contained only whitespace / blank lines
            st.error('Please enter valid YouTube URLs')
    else:
        st.warning('Please enter at least one YouTube URL')
# Add footer with information; raw HTML is used only to centre the note
st.markdown("---")
st.markdown("""
<div style='text-align: center'>
<p>💡 <i>Note: This tool works best with videos that have enabled subtitles/transcripts.</i></p>
</div>
""", unsafe_allow_html=True)