noumanjavaid committed on
Commit
7aea2f1
·
verified ·
1 Parent(s): edbfc16

Create yt-app.py

Browse files
Files changed (1) hide show
  1. yt-app.py +179 -0
yt-app.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+ from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, VideoUnavailable
5
+ from pytube import YouTube
6
+ import re
7
+ from typing import Dict, List, Optional, Tuple
8
+ import time
9
+
10
def extract_video_id(url: str) -> Optional[str]:
    """Extract the 11-character YouTube video ID from a YouTube URL.

    Recognised shapes (scheme and sub-domain are optional):

    * ``youtube.com/<anything>?v=<id>`` (watch pages, ID in the ``v`` parameter)
    * ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``
    * ``youtu.be/<id>``

    ``youtube-nocookie.com`` is accepted wherever ``youtube.com`` is.

    Args:
        url: The text to inspect; surrounding whitespace is ignored.

    Returns:
        The video ID, or ``None`` when ``url`` is empty or is not one of the
        recognised YouTube video URL shapes.
    """
    if not url:
        return None

    # Exactly 11 ID characters, and the run must end there: a longer run of
    # ID characters is not a video ID and must not be silently truncated.
    video_id = r'([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
    # The host must start the string or follow "/" or "." so that look-alike
    # domains such as "notyoutube.com" are rejected.
    host = r'(?:^|[/.])youtube(?:-nocookie)?\.com'
    patterns = [
        host + r'/[^#]*[?&]v=' + video_id,
        host + r'/(?:embed|shorts|live|v)/' + video_id,
        r'(?:^|[/.])youtu\.be/' + video_id,
    ]

    candidate = url.strip()
    for pattern in patterns:
        match = re.search(pattern, candidate, re.IGNORECASE)
        if match:
            return match.group(1)
    return None
23
+
24
def get_video_info(url: str) -> Tuple[str, str]:
    """Look up a video's title and channel name using pytube.

    This is a best-effort lookup: the metadata is only used for display, so
    any failure is reported as the placeholder "Unknown" instead of raising.

    Args:
        url: The video URL handed to ``pytube.YouTube``.

    Returns:
        A ``(title, author)`` tuple; each element is "Unknown" when the
        lookup fails or the value comes back empty.
    """
    try:
        yt = YouTube(url)
        # Guard against empty/None metadata so callers always get a label.
        return yt.title or "Unknown", yt.author or "Unknown"
    except Exception:
        # Deliberately broad: a metadata failure must not abort extraction.
        return "Unknown", "Unknown"
31
+
32
def get_transcript_with_retries(video_id: str, max_retries: int = 3) -> Dict:
    """Fetch a video's transcript, retrying on unexpected errors.

    English transcripts ('en', then 'en-US', then 'en-GB') are preferred.
    When none of those exist, the first transcript the video lists is used.

    Args:
        video_id: The YouTube video ID.
        max_retries: Maximum number of attempts; values below 1 are treated
            as 1 so that a result is always produced.

    Returns:
        A dict with the keys:
            'success': True when a transcript was fetched.
            'transcript': The transcript text joined with spaces ('' on failure).
            'error': A human-readable message on failure, otherwise None.
            'language': The transcript's language on success, otherwise None.
    """
    result = {
        'success': False,
        'transcript': '',
        'error': None,
        'language': None
    }

    attempts = max(1, max_retries)
    for attempt in range(attempts):
        try:
            # First get the list of transcripts available for the video
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

            try:
                # Language codes are tried in the order given.
                transcript = transcript_list.find_transcript(['en', 'en-US', 'en-GB'])
            except NoTranscriptFound:
                # No English variant: fall back to the first listed transcript.
                transcript = next(iter(transcript_list), None)
                if transcript is None:
                    result['error'] = "No transcripts found for this video"
                    break

            # Get the actual transcript
            transcript_data = transcript.fetch()
            # NOTE(review): entries are dicts in older library releases;
            # newer releases appear to return snippet objects exposing
            # `.text` - confirm against the installed version.
            result['transcript'] = ' '.join(
                entry['text'] if isinstance(entry, dict) else entry.text
                for entry in transcript_data
            )
            result['language'] = transcript.language
            result['success'] = True
            return result

        # These outcomes are permanent, so retrying would not help.
        except TranscriptsDisabled:
            result['error'] = "Transcripts are disabled for this video"
            break
        except VideoUnavailable:
            result['error'] = "Video is unavailable"
            break
        except NoTranscriptFound:
            result['error'] = "No transcripts found for this video"
            break
        except Exception as e:
            if attempt == attempts - 1:
                result['error'] = f"Error: {str(e)}"
            else:
                time.sleep(1)  # Wait before retry (no wait after the last attempt)

    return result
75
+
76
# Page-level configuration and introductory text.
st.set_page_config(page_title="YouTube Transcript Extractor", layout="wide")

st.title('📝 YouTube Transcript Extractor')
st.markdown("""
This app extracts transcripts from YouTube videos. Simply paste one or more YouTube URLs below.
- Supports multiple YouTube URL formats
- Handles videos with disabled subtitles
- Provides detailed error messages
- Allows downloading results as CSV
""")

# Text area for input; the raw text is split into one URL per line below.
urls = st.text_area('YouTube URLs (one per line)',
                    height=150,
                    help='Enter YouTube URLs, one per line',
                    placeholder="https://www.youtube.com/watch?v=...")
92
+
93
# Extraction runs only on the script run in which the button was clicked.
if st.button('Extract Transcripts', type='primary'):
    if urls:
        # Split URLs into list and clean (drop blank lines and padding)
        url_list = [url.strip() for url in urls.split('\n') if url.strip()]

        if url_list:
            results = []

            # Progress tracking
            progress_text = "Extracting transcripts..."
            progress_bar = st.progress(0, text=progress_text)

            for i, url in enumerate(url_list):
                video_id = extract_video_id(url)
                if video_id:
                    # Get video info
                    title, author = get_video_info(url)

                    # Get transcript
                    transcript_result = get_transcript_with_retries(video_id)
                    succeeded = transcript_result['success']

                    results.append({
                        'URL': url,
                        'Video ID': video_id,
                        'Title': title,
                        'Channel': author,
                        'Status': 'Success' if succeeded else 'Error',
                        'Language': transcript_result['language'] if succeeded else None,
                        'Error': transcript_result['error'],
                        'Transcript': transcript_result['transcript'] if succeeded else None
                    })
                else:
                    # Keep a row for unparseable input so it shows up in the
                    # overview table and the CSV with an explanation.
                    results.append({
                        'URL': url,
                        'Video ID': None,
                        'Title': 'Unknown',
                        'Channel': 'Unknown',
                        'Status': 'Error',
                        'Language': None,
                        'Error': 'Invalid YouTube URL',
                        'Transcript': None
                    })

                # Update progress
                progress_bar.progress((i + 1) / len(url_list),
                                      text=f"{progress_text} ({i + 1}/{len(url_list)})")

            # Create DataFrame
            df = pd.DataFrame(results)

            # Display results in tabs
            tab1, tab2 = st.tabs(["📊 Results Overview", "📑 Full Transcripts"])

            with tab1:
                st.dataframe(df[['URL', 'Title', 'Channel', 'Status', 'Language', 'Error']],
                             use_container_width=True)

            with tab2:
                for row_index, row in df.iterrows():
                    if row['Status'] == 'Success':
                        with st.expander(f"📺 {row['Title']} ({row['Channel']})"):
                            # The row index keeps the widget key unique when
                            # the same video appears more than once in the
                            # input; a key built from the video ID alone
                            # would be duplicated.
                            st.text_area("Transcript:",
                                         value=row['Transcript'],
                                         height=200,
                                         key=f"transcript_{row_index}_{row['Video ID']}")

            # Download button
            if not df.empty:
                csv = df.to_csv(index=False)
                st.download_button(
                    label="⬇️ Download results as CSV",
                    data=csv,
                    file_name="youtube_transcripts.csv",
                    mime="text/csv"
                )
        else:
            # The input contained only whitespace / blank lines.
            st.error('Please enter valid YouTube URLs')
    else:
        st.warning('Please enter at least one YouTube URL')
172
+
173
# Add footer with information; raw HTML is used only to centre the note.
st.markdown("---")
st.markdown("""
<div style='text-align: center'>
<p>💡 <i>Note: This tool works best with videos that have enabled subtitles/transcripts.</i></p>
</div>
""", unsafe_allow_html=True)