# Source page metadata (not part of the program):
#   Spaces: Running
#   File size: 6,858 Bytes
#   7aea2f1
import streamlit as st
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, VideoUnavailable
from pytube import YouTube
import re
from typing import Dict, List, Optional, Tuple
import time
def extract_video_id(url: str) -> Optional[str]:
    """Extract the 11-character video ID from a YouTube URL.

    Recognised forms (the scheme is optional):

    * ``youtube.com/watch?v=<id>`` (``v`` may appear anywhere in the query)
    * ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``,
      ``/e/<id>``
    * ``youtu.be/<id>``

    Sub-domains such as ``www.``, ``m.`` and ``music.`` are accepted, as is
    ``youtube-nocookie.com``.

    Args:
        url: The text the user entered for one video.

    Returns:
        The video ID, or ``None`` when ``url`` is empty, is not on a YouTube
        host, or does not carry a well-formed 11-character ID.
    """
    # Function-scope import: the module's import block does not pull in urllib.
    from urllib.parse import parse_qs, urlparse

    if not url:
        return None

    candidate = url.strip()
    # urlparse only fills in the host when a scheme (or "//") is present, so
    # normalise bare inputs such as "youtu.be/<id>" first.
    if candidate.startswith('//'):
        candidate = 'https:' + candidate
    elif not re.match(r'[A-Za-z][A-Za-z0-9+.\-]*://', candidate):
        candidate = 'https://' + candidate

    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or '').lower()
    except ValueError:
        # Raised by urlparse for malformed netlocs (e.g. a broken IPv6 literal).
        return None

    def on_domain(domain: str) -> bool:
        # Exact match or a true sub-domain; "notyoutube.com" must not pass.
        return host == domain or host.endswith('.' + domain)

    segments = [part for part in parsed.path.split('/') if part]

    if on_domain('youtu.be'):
        candidates = segments[:1]
    elif on_domain('youtube.com') or on_domain('youtube-nocookie.com'):
        candidates = list(parse_qs(parsed.query).get('v', []))
        if len(segments) >= 2 and segments[0] in ('embed', 'shorts', 'live', 'v', 'e'):
            candidates.append(segments[1])
    else:
        return None

    # fullmatch rejects IDs that are too long instead of silently truncating.
    for video_id in candidates:
        if re.fullmatch(r'[0-9A-Za-z_-]{11}', video_id):
            return video_id
    return None
def get_video_info(url: str) -> Tuple[str, str]:
    """Look up a video's title and author through pytube.

    Args:
        url: The video URL handed to ``YouTube``.

    Returns:
        ``(title, author)``. If constructing ``YouTube`` or reading either
        attribute raises any ``Exception``, ``("Unknown", "Unknown")`` is
        returned instead.
    """
    fallback = ("Unknown", "Unknown")
    try:
        video = YouTube(url)
        details = (video.title, video.author)
    except Exception:
        # Best-effort lookup: metadata is optional, so any failure degrades
        # to the placeholder pair rather than propagating.
        return fallback
    return details
def get_transcript_with_retries(video_id: str, max_retries: int = 3) -> Dict:
    """Fetch a video's transcript, retrying on unexpected errors.

    English transcripts (``en``, then ``en-US``, then ``en-GB``) are preferred.
    If none exists, the first transcript the video offers in any language is
    used instead.

    Args:
        video_id: The YouTube video ID.
        max_retries: Maximum number of attempts. Values below 1 make no
            attempt and return a failure result with an explanatory error.

    Returns:
        A dict with the keys:

        * ``success``: ``True`` when a transcript was fetched.
        * ``transcript``: The transcript text joined by single spaces, or
          ``''`` on failure.
        * ``error``: A human-readable message on failure, else ``None``.
        * ``language``: The fetched transcript's ``language``, else ``None``.
    """
    result = {
        'success': False,
        'transcript': '',
        'error': None,
        'language': None,
    }

    if max_retries < 1:
        # Without this the loop never runs and the caller gets a failure
        # with no explanation.
        result['error'] = "Error: max_retries must be at least 1"
        return result

    for attempt in range(max_retries):
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
            try:
                # Codes are tried in order, so this matches the former
                # "en first, then regional variants" preference.
                transcript = transcript_list.find_transcript(['en', 'en-US', 'en-GB'])
            except NoTranscriptFound:
                # No English at all: fall back to whatever the video offers.
                transcript = next(iter(transcript_list), None)
                if transcript is None:
                    raise  # handled by the NoTranscriptFound clause below

            transcript_data = transcript.fetch()
            result['success'] = True
            result['transcript'] = ' '.join(entry['text'] for entry in transcript_data)
            result['language'] = transcript.language
            return result
        # The three cases below are permanent; retrying cannot help.
        except TranscriptsDisabled:
            result['error'] = "Transcripts are disabled for this video"
            break
        except VideoUnavailable:
            result['error'] = "Video is unavailable"
            break
        except NoTranscriptFound:
            result['error'] = "No transcripts found for this video"
            break
        except Exception as e:
            is_last_attempt = attempt == max_retries - 1
            if is_last_attempt:
                result['error'] = f"Error: {str(e)}"
            else:
                # Pause only when another attempt will actually follow.
                time.sleep(1)
    return result
# ---------------------------------------------------------------------------
# Streamlit page: collect URLs, extract a transcript for each one, then show
# the results in a table, per-video expanders, and a CSV download.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="YouTube Transcript Extractor", layout="wide")
# NOTE(review): the leading "π" here and in the two tab labels below looks like
# a mis-decoded emoji whose original code point cannot be recovered from this
# file — confirm and restore.
st.title('π YouTube Transcript Extractor')
st.markdown("""
This app extracts transcripts from YouTube videos. Simply paste one or more YouTube URLs below.
- Supports multiple YouTube URL formats
- Handles videos with disabled subtitles
- Provides detailed error messages
- Allows downloading results as CSV
""")

# Text area for input
urls = st.text_area('YouTube URLs (one per line)',
                    height=150,
                    help='Enter YouTube URLs, one per line',
                    placeholder="https://www.youtube.com/watch?v=...")

if st.button('Extract Transcripts', type='primary'):
    if urls:
        # One URL per line; blank and whitespace-only lines are ignored.
        url_list = [line.strip() for line in urls.split('\n') if line.strip()]
        if url_list:
            results = []
            total = len(url_list)

            # Progress tracking
            progress_text = "Extracting transcripts..."
            progress_bar = st.progress(0, text=progress_text)

            for position, url in enumerate(url_list, start=1):
                video_id = extract_video_id(url)
                if video_id:
                    title, author = get_video_info(url)
                    outcome = get_transcript_with_retries(video_id)
                    succeeded = outcome['success']
                    results.append({
                        'URL': url,
                        'Video ID': video_id,
                        'Title': title,
                        'Channel': author,
                        'Status': 'Success' if succeeded else 'Error',
                        'Language': outcome['language'] if succeeded else None,
                        'Error': outcome['error'],
                        'Transcript': outcome['transcript'] if succeeded else None,
                    })
                else:
                    results.append({
                        'URL': url,
                        'Video ID': None,
                        'Title': 'Unknown',
                        'Channel': 'Unknown',
                        'Status': 'Error',
                        'Language': None,
                        'Error': 'Invalid YouTube URL',
                        'Transcript': None,
                    })

                # Update progress
                progress_bar.progress(position / total,
                                      text=f"{progress_text} ({position}/{total})")

            # Create DataFrame
            df = pd.DataFrame(results)

            # Display results in tabs
            tab1, tab2 = st.tabs(["π Results Overview", "π Full Transcripts"])
            with tab1:
                st.dataframe(df[['URL', 'Title', 'Channel', 'Status', 'Language', 'Error']],
                             use_container_width=True)
            with tab2:
                for row_index, row in df.iterrows():
                    if row['Status'] == 'Success':
                        with st.expander(f"📺 {row['Title']} ({row['Channel']})"):
                            # The row index keeps the widget key unique when the
                            # same video appears more than once in the input;
                            # identical keys make Streamlit raise.
                            st.text_area("Transcript:",
                                         value=row['Transcript'],
                                         height=200,
                                         key=f"transcript_{row_index}_{row['Video ID']}")

            # Download button
            if not df.empty:
                csv = df.to_csv(index=False)
                st.download_button(
                    label="⬇️ Download results as CSV",
                    data=csv,
                    file_name="youtube_transcripts.csv",
                    mime="text/csv"
                )
        else:
            # The input contained only whitespace / blank lines.
            st.error('Please enter valid YouTube URLs')
    else:
        st.warning('Please enter at least one YouTube URL')

# Add footer with information
st.markdown("---")
st.markdown("""
<div style='text-align: center'>
<p>💡 <i>Note: This tool works best with videos that have enabled subtitles/transcripts.</i></p>
</div>
""", unsafe_allow_html=True)