Spaces:

noumanjavaid
/

yt-transcript

Running

App Files Files Community

noumanjavaid committed on Jan 15

Commit

7aea2f1

verified ·

1 Parent(s): edbfc16

Create yt-app.py

Browse files

Files changed (1) hide show

yt-app.py +179 -0

yt-app.py ADDED Viewed

	@@ -0,0 +1,179 @@

+import streamlit as st
+import pandas as pd
+from youtube_transcript_api import YouTubeTranscriptApi
+from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, VideoUnavailable
+from pytube import YouTube
+import re
+from typing import Dict, List, Optional, Tuple
+import time
+def extract_video_id(url: str) -> Optional[str]:
+    """Extract YouTube video ID from various URL formats."""
+    patterns = [
+        r'(?:v=|\/)([0-9A-Za-z_-]{11}).*',
+        r'(?:embed\/)([0-9A-Za-z_-]{11})',
+        r'(?:youtu\.be\/)([0-9A-Za-z_-]{11})'
+    ]
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+    return None
+def get_video_info(url: str) -> Tuple[str, str]:
+    """Get video title and author using pytube."""
+    try:
+        yt = YouTube(url)
+        return yt.title, yt.author
+    except Exception as e:
+        return "Unknown", "Unknown"
+def get_transcript_with_retries(video_id: str, max_retries: int = 3) -> Dict:
+    """Get transcript with multiple retries and error handling."""
+    result = {
+        'success': False,
+        'transcript': '',
+        'error': None,
+        'language': None
+    }
+    for attempt in range(max_retries):
+        try:
+            # First try to get available transcript list
+            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
+            # Try to get English transcript first
+            try:
+                transcript = transcript_list.find_transcript(['en'])
+            except NoTranscriptFound:
+                # If no English, get the first available transcript
+                transcript = transcript_list.find_transcript(['en-US', 'en-GB'])
+            # Get the actual transcript
+            transcript_data = transcript.fetch()
+            result['success'] = True
+            result['transcript'] = ' '.join([entry['text'] for entry in transcript_data])
+            result['language'] = transcript.language
+            return result
+        except TranscriptsDisabled:
+            result['error'] = "Transcripts are disabled for this video"
+            break
+        except VideoUnavailable:
+            result['error'] = "Video is unavailable"
+            break
+        except NoTranscriptFound:
+            result['error'] = "No transcripts found for this video"
+            break
+        except Exception as e:
+            if attempt == max_retries - 1:
+                result['error'] = f"Error: {str(e)}"
+            time.sleep(1)  # Wait before retry
+    return result
+st.set_page_config(page_title="YouTube Transcript Extractor", layout="wide")
+st.title('📝 YouTube Transcript Extractor')
+st.markdown("""
+This app extracts transcripts from YouTube videos. Simply paste one or more YouTube URLs below.
+- Supports multiple YouTube URL formats
+- Handles videos with disabled subtitles
+- Provides detailed error messages
+- Allows downloading results as CSV
+""")
+# Text area for input
+urls = st.text_area('YouTube URLs (one per line)',
+                    height=150,
+                    help='Enter YouTube URLs, one per line',
+                    placeholder="https://www.youtube.com/watch?v=...")
+if st.button('Extract Transcripts', type='primary'):
+    if urls:
+        # Split URLs into list and clean
+        url_list = [url.strip() for url in urls.split('\n') if url.strip()]
+        if url_list:
+            results = []
+            # Progress tracking
+            progress_text = "Extracting transcripts..."
+            progress_bar = st.progress(0, text=progress_text)
+            for i, url in enumerate(url_list):
+                video_id = extract_video_id(url)
+                if video_id:
+                    # Get video info
+                    title, author = get_video_info(url)
+                    # Get transcript
+                    transcript_result = get_transcript_with_retries(video_id)
+                    results.append({
+                        'URL': url,
+                        'Video ID': video_id,
+                        'Title': title,
+                        'Channel': author,
+                        'Status': 'Success' if transcript_result['success'] else 'Error',
+                        'Language': transcript_result['language'] if transcript_result['success'] else None,
+                        'Error': transcript_result['error'],
+                        'Transcript': transcript_result['transcript'] if transcript_result['success'] else None
+                    })
+                else:
+                    results.append({
+                        'URL': url,
+                        'Video ID': None,
+                        'Title': 'Unknown',
+                        'Channel': 'Unknown',
+                        'Status': 'Error',
+                        'Language': None,
+                        'Error': 'Invalid YouTube URL',
+                        'Transcript': None
+                    })
+                # Update progress
+                progress_bar.progress((i + 1) / len(url_list),
+                                   text=f"{progress_text} ({i + 1}/{len(url_list)})")
+            # Create DataFrame
+            df = pd.DataFrame(results)
+            # Display results in tabs
+            tab1, tab2 = st.tabs(["📊 Results Overview", "📑 Full Transcripts"])
+            with tab1:
+                st.dataframe(df[['URL', 'Title', 'Channel', 'Status', 'Language', 'Error']],
+                           use_container_width=True)
+            with tab2:
+                for _, row in df.iterrows():
+                    if row['Status'] == 'Success':
+                        with st.expander(f"📺 {row['Title']} ({row['Channel']})"):
+                            st.text_area("Transcript:",
+                                       value=row['Transcript'],
+                                       height=200,
+                                       key=f"transcript_{row['Video ID']}")
+            # Download button
+            if not df.empty:
+                csv = df.to_csv(index=False)
+                st.download_button(
+                    label="⬇️ Download results as CSV",
+                    data=csv,
+                    file_name="youtube_transcripts.csv",
+                    mime="text/csv"
+                )
+        else:
+            st.error('Please enter valid YouTube URLs')
+    else:
+        st.warning('Please enter at least one YouTube URL')
+# Add footer with information
+st.markdown("---")
+st.markdown("""
+<div style='text-align: center'>
+<p>💡 <i>Note: This tool works best with videos that have enabled subtitles/transcripts.</i></p>
+</div>
+""", unsafe_allow_html=True)