"""Streamlit app that extracts transcripts for one or more YouTube URLs."""

import re
import time
from typing import Dict, List, Optional, Tuple

import pandas as pd
import streamlit as st
from pytube import YouTube
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import (
    NoTranscriptFound,
    TranscriptsDisabled,
    VideoUnavailable,
)

# Places where a video ID is known to appear in a YouTube URL. These are tried
# first so that a hostname or unrelated path segment can never win over the
# real ID.
_KNOWN_ID_LOCATION = re.compile(
    r'(?:v=|youtu\.be/|/embed/|/shorts/|/live/|/v/)([0-9A-Za-z_-]{11})'
)

# Fallback: a path segment that is exactly 11 ID characters long. The
# lookbehind skips the "//" that precedes the hostname; the lookahead rejects
# longer segments (e.g. "/subscriptions") instead of truncating them.
_BARE_PATH_ID = re.compile(r'(?<![/:])/([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])')


def extract_video_id(url: str) -> Optional[str]:
    """Extract the 11-character YouTube video ID from a URL.

    Args:
        url: A YouTube URL (``watch?v=``, ``youtu.be/``, ``/embed/``,
            ``/shorts/``, ``/live/``, ``/v/``) or any URL whose path contains
            a segment of exactly 11 ID characters.

    Returns:
        The video ID, or ``None`` when no ID could be located.
    """
    for pattern in (_KNOWN_ID_LOCATION, _BARE_PATH_ID):
        match = pattern.search(url)
        if match:
            return match.group(1)
    return None


def get_video_info(url: str) -> Tuple[str, str]:
    """Return ``(title, author)`` for the video using pytube.

    The lookup is best-effort: any failure yields ``("Unknown", "Unknown")``
    so that a metadata problem never blocks transcript extraction.
    """
    try:
        yt = YouTube(url)
        return yt.title, yt.author
    except Exception:
        # Deliberately broad: metadata is optional for the caller.
        return "Unknown", "Unknown"


def _entry_text(entry) -> str:
    """Return the text of one transcript entry.

    Entries are either dicts with a ``'text'`` key or snippet objects exposing
    a ``text`` attribute, depending on the youtube-transcript-api release.
    """
    if isinstance(entry, dict):
        return entry['text']
    return entry.text


def get_transcript_with_retries(video_id: str, max_retries: int = 3) -> Dict:
    """Fetch a transcript, retrying on unexpected errors.

    English (``en``, ``en-US``, ``en-GB``) is preferred; if none of those
    exist, the first transcript listed for the video is used instead.

    Args:
        video_id: The YouTube video ID.
        max_retries: Maximum number of attempts for unexpected errors.

    Returns:
        A dict with the keys ``'success'`` (bool), ``'transcript'`` (the
        entry texts joined with spaces, ``''`` on failure), ``'error'``
        (message or ``None``) and ``'language'`` (the chosen transcript's
        ``language`` attribute, or ``None``).
    """
    result = {
        'success': False,
        'transcript': '',
        'error': None,
        'language': None,
    }

    for attempt in range(max_retries):
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

            try:
                transcript = transcript_list.find_transcript(
                    ['en', 'en-US', 'en-GB']
                )
            except NoTranscriptFound:
                # No English variant: take whatever transcript comes first.
                transcript = next(iter(transcript_list), None)
                if transcript is None:
                    raise

            transcript_data = transcript.fetch()

            result['success'] = True
            result['transcript'] = ' '.join(
                _entry_text(entry) for entry in transcript_data
            )
            result['language'] = transcript.language
            return result

        # These conditions are permanent, so retrying would not help.
        except TranscriptsDisabled:
            result['error'] = "Transcripts are disabled for this video"
            break
        except VideoUnavailable:
            result['error'] = "Video is unavailable"
            break
        except NoTranscriptFound:
            result['error'] = "No transcripts found for this video"
            break
        except Exception as e:
            if attempt == max_retries - 1:
                result['error'] = f"Error: {e}"
            else:
                time.sleep(1)  # Wait before the next attempt only.

    return result


def _build_result_row(url: str) -> Dict:
    """Build one results-table row (metadata + transcript outcome) for *url*."""
    video_id = extract_video_id(url)
    if video_id is None:
        return {
            'URL': url,
            'Video ID': None,
            'Title': 'Unknown',
            'Channel': 'Unknown',
            'Status': 'Error',
            'Language': None,
            'Error': 'Invalid YouTube URL',
            'Transcript': None,
        }

    title, author = get_video_info(url)
    outcome = get_transcript_with_retries(video_id)
    succeeded = outcome['success']
    return {
        'URL': url,
        'Video ID': video_id,
        'Title': title,
        'Channel': author,
        'Status': 'Success' if succeeded else 'Error',
        'Language': outcome['language'] if succeeded else None,
        'Error': outcome['error'],
        'Transcript': outcome['transcript'] if succeeded else None,
    }


st.set_page_config(page_title="YouTube Transcript Extractor", layout="wide")

st.title('📝 YouTube Transcript Extractor')
st.markdown("""
This app extracts transcripts from YouTube videos. Simply paste one or more YouTube URLs below.
- Supports multiple YouTube URL formats
- Handles videos with disabled subtitles
- Provides detailed error messages
- Allows downloading results as CSV
""")

# Text area for input
urls = st.text_area(
    'YouTube URLs (one per line)',
    height=150,
    help='Enter YouTube URLs, one per line',
    placeholder="https://www.youtube.com/watch?v=...",
)

if st.button('Extract Transcripts', type='primary'):
    # One URL per line; blank and whitespace-only lines are ignored.
    url_list = [line.strip() for line in (urls or '').split('\n') if line.strip()]

    if not urls:
        st.warning('Please enter at least one YouTube URL')
    elif not url_list:
        st.error('Please enter valid YouTube URLs')
    else:
        results = []

        # Progress tracking
        progress_text = "Extracting transcripts..."
        progress_bar = st.progress(0, text=progress_text)
        total = len(url_list)

        for i, url in enumerate(url_list, start=1):
            results.append(_build_result_row(url))
            progress_bar.progress(i / total, text=f"{progress_text} ({i}/{total})")

        df = pd.DataFrame(results)

        # Display results in tabs
        tab1, tab2 = st.tabs(["📊 Results Overview", "📑 Full Transcripts"])

        with tab1:
            st.dataframe(
                df[['URL', 'Title', 'Channel', 'Status', 'Language', 'Error']],
                use_container_width=True,
            )

        with tab2:
            for idx, row in df.iterrows():
                if row['Status'] == 'Success':
                    with st.expander(f"📺 {row['Title']} ({row['Channel']})"):
                        st.text_area(
                            "Transcript:",
                            value=row['Transcript'],
                            height=200,
                            # The row index keeps keys unique when the same
                            # video appears more than once in the input.
                            key=f"transcript_{idx}_{row['Video ID']}",
                        )

        # Download button
        if not df.empty:
            csv = df.to_csv(index=False)
            st.download_button(
                label="⬇️ Download results as CSV",
                data=csv,
                file_name="youtube_transcripts.csv",
                mime="text/csv",
            )

# Add footer with information
st.markdown("---")
st.markdown("""

💡 Note: This tool works best with videos that have enabled subtitles/transcripts.

""", unsafe_allow_html=True)