# Spaces: noumanjavaid / yt-transcript
# Running
# File size: 6,858 Bytes
# 7aea2f1

import streamlit as st
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, VideoUnavailable
from pytube import YouTube
import re
from typing import Dict, List, Optional, Tuple
import time

def extract_video_id(url: str) -> Optional[str]:
    """Extract the 11-character YouTube video ID from a URL.

    Recognised forms (the host name itself is not validated):
      * a ``v=<id>`` query parameter, anywhere in the query string
      * ``youtu.be/<id>``
      * ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``,
        ``/watch/<id>`` path segments

    Args:
        url: Text expected to contain a YouTube video URL.

    Returns:
        The video ID, or ``None`` when ``url`` is empty or matches none of
        the recognised forms (for example channel or feed pages).
    """
    if not url:
        return None

    video_id = r'([0-9A-Za-z_-]{11})'
    patterns = [
        # \b stops parameters that merely end in "v" (e.g. "rev=") matching.
        r'\bv=' + video_id,
        r'youtu\.be/' + video_id,
        # Only path prefixes that address a single video. Matching *any*
        # "/" + 11 characters would turn "/feed/subscriptions" or a channel
        # URL into a bogus ID.
        r'/(?:embed|shorts|live|v|watch)/' + video_id,
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

def get_video_info(url: str) -> Tuple[str, str]:
    """Look up a video's title and author through pytube.

    This is a best-effort lookup: any exception raised while constructing the
    ``YouTube`` object or reading its attributes is swallowed and the
    placeholder pair ``("Unknown", "Unknown")`` is returned instead.

    Args:
        url: The video URL handed to ``pytube.YouTube``.

    Returns:
        A ``(title, author)`` tuple.
    """
    fallback = ("Unknown", "Unknown")
    try:
        video = YouTube(url)
        info = (video.title, video.author)
    except Exception:
        # Metadata is optional for the caller, so failures are not surfaced.
        return fallback
    return info

def _pick_transcript(transcript_list):
    """Return the preferred transcript from *transcript_list*, or None if it yields nothing.

    English variants are preferred; otherwise the first transcript produced by
    iterating the list is used, whatever its language.
    """
    try:
        return transcript_list.find_transcript(['en', 'en-US', 'en-GB'])
    except NoTranscriptFound:
        return next(iter(transcript_list), None)


def _entry_text(entry) -> str:
    """Return the text of one fetched transcript entry."""
    # NOTE(review): older youtube-transcript-api releases return dict entries;
    # newer ones appear to return snippet objects exposing ``.text`` - confirm
    # against the installed version.
    if isinstance(entry, dict):
        return entry['text']
    return entry.text


def get_transcript_with_retries(video_id: str, max_retries: int = 3) -> Dict:
    """Fetch a video's transcript, retrying on unexpected errors.

    Args:
        video_id: The YouTube video ID.
        max_retries: Total number of attempts to make. Only unexpected
            exceptions are retried (with a one-second pause between attempts);
            disabled transcripts, unavailable videos and missing transcripts
            end the loop immediately.

    Returns:
        A dict with the keys:
          * ``success``: True only when a transcript was fully retrieved.
          * ``transcript``: The entry texts joined by single spaces ('' on failure).
          * ``error``: A human-readable message on failure, otherwise None.
          * ``language``: ``transcript.language`` of the transcript used, otherwise None.
    """
    result = {
        'success': False,
        'transcript': '',
        'error': None,
        'language': None
    }

    if max_retries < 1:
        # Without this the loop never runs and the caller would receive a
        # failure that carries no explanation.
        result['error'] = "No attempts made: max_retries must be at least 1"
        return result

    for attempt in range(max_retries):
        try:
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

            transcript = _pick_transcript(transcript_list)
            if transcript is None:
                result['error'] = "No transcripts found for this video"
                break

            transcript_data = transcript.fetch()
            text = ' '.join(_entry_text(entry) for entry in transcript_data)

            # Mark success only after everything that can raise has finished,
            # so a late failure never leaves success=True next to an error.
            result['transcript'] = text
            result['language'] = transcript.language
            result['success'] = True
            return result

        except TranscriptsDisabled:
            result['error'] = "Transcripts are disabled for this video"
            break
        except VideoUnavailable:
            result['error'] = "Video is unavailable"
            break
        except NoTranscriptFound:
            result['error'] = "No transcripts found for this video"
            break
        except Exception as e:
            if attempt == max_retries - 1:
                result['error'] = f"Error: {str(e)}"
            else:
                # Pause only when another attempt will actually follow.
                time.sleep(1)

    return result

# Page setup: browser-tab title and a full-width layout.
st.set_page_config(layout="wide", page_title="YouTube Transcript Extractor")

# Heading followed by a short feature summary rendered as Markdown.
st.title('📝 YouTube Transcript Extractor')
intro_markdown = """
This app extracts transcripts from YouTube videos. Simply paste one or more YouTube URLs below.
- Supports multiple YouTube URL formats
- Handles videos with disabled subtitles
- Provides detailed error messages
- Allows downloading results as CSV
"""
st.markdown(intro_markdown)

# Multi-line input box; `urls` holds the raw text the user typed.
urls = st.text_area(
    'YouTube URLs (one per line)',
    placeholder="https://www.youtube.com/watch?v=...",
    help='Enter YouTube URLs, one per line',
    height=150,
)

def _build_result_row(url: str) -> Dict:
    """Process one URL and return its row for the results table."""
    video_id = extract_video_id(url)
    if not video_id:
        return {
            'URL': url,
            'Video ID': None,
            'Title': 'Unknown',
            'Channel': 'Unknown',
            'Status': 'Error',
            'Language': None,
            'Error': 'Invalid YouTube URL',
            'Transcript': None,
        }

    title, author = get_video_info(url)
    transcript_result = get_transcript_with_retries(video_id)
    succeeded = transcript_result['success']
    return {
        'URL': url,
        'Video ID': video_id,
        'Title': title,
        'Channel': author,
        'Status': 'Success' if succeeded else 'Error',
        'Language': transcript_result['language'] if succeeded else None,
        'Error': transcript_result['error'],
        'Transcript': transcript_result['transcript'] if succeeded else None,
    }


# Runs once per click: process every entered URL, then show and export the results.
if st.button('Extract Transcripts', type='primary'):
    # One URL per line; blank and whitespace-only lines are dropped.
    url_list = [line.strip() for line in (urls or '').split('\n') if line.strip()]

    if not urls:
        st.warning('Please enter at least one YouTube URL')
    elif not url_list:
        # Something was typed, but it was only whitespace.
        st.error('Please enter valid YouTube URLs')
    else:
        results = []

        # Progress tracking
        progress_text = "Extracting transcripts..."
        progress_bar = st.progress(0, text=progress_text)
        total = len(url_list)

        for done, url in enumerate(url_list, start=1):
            results.append(_build_result_row(url))
            progress_bar.progress(done / total,
                                  text=f"{progress_text} ({done}/{total})")

        df = pd.DataFrame(results)

        # Display results in tabs
        tab1, tab2 = st.tabs(["📊 Results Overview", "📑 Full Transcripts"])

        with tab1:
            st.dataframe(df[['URL', 'Title', 'Channel', 'Status', 'Language', 'Error']],
                         use_container_width=True)

        with tab2:
            for row_number, row in df.iterrows():
                if row['Status'] != 'Success':
                    continue
                with st.expander(f"📺 {row['Title']} ({row['Channel']})"):
                    # The row number keeps the widget key unique when the same
                    # video appears more than once in the input; a key built
                    # from the video ID alone would be duplicated.
                    st.text_area("Transcript:",
                                 value=row['Transcript'],
                                 height=200,
                                 key=f"transcript_{row_number}_{row['Video ID']}")

        # url_list is non-empty here, so the table always has at least one row.
        st.download_button(
            label="⬇️ Download results as CSV",
            data=df.to_csv(index=False),
            file_name="youtube_transcripts.csv",
            mime="text/csv"
        )

# Footer: a horizontal rule followed by a centered usage note (raw HTML).
st.markdown("---")
footer_html = """
<div style='text-align: center'>
<p>💡 <i>Note: This tool works best with videos that have enabled subtitles/transcripts.</i></p>
</div>
"""
st.markdown(footer_html, unsafe_allow_html=True)