Spaces:
Running
Running
Create yt-app.py
Browse files
yt-app.py
ADDED
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
4 |
+
from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, VideoUnavailable
|
5 |
+
from pytube import YouTube
|
6 |
+
import re
|
7 |
+
from typing import Dict, List, Optional, Tuple
|
8 |
+
import time
|
9 |
+
|
10 |
+
def extract_video_id(url: str) -> Optional[str]:
    """Extract the 11-character YouTube video ID from a URL.

    The ID is recognised either as the value following ``v=`` (watch URLs)
    or as a path segment following ``/`` (``youtu.be/<id>``, ``/embed/<id>``
    and similar path-style URLs).

    Args:
        url: The URL text to inspect.

    Returns:
        The video ID, or ``None`` when ``url`` is empty or contains no
        token of exactly 11 ID characters in one of those positions.
    """
    if not url:
        return None

    # The trailing negative lookahead requires the token to be *exactly* 11
    # characters long. Without it, any longer path segment (for example the
    # 24-character ID in a /channel/ URL) was silently truncated to its
    # first 11 characters and returned as if it were a video ID.
    #
    # The separate "embed/" and "youtu.be/" patterns are not listed any more:
    # both end in "/" and were therefore already matched by this pattern.
    pattern = r'(?:v=|\/)([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'

    match = re.search(pattern, url)
    if match:
        return match.group(1)
    return None
|
23 |
+
|
24 |
+
def get_video_info(url: str) -> Tuple[str, str]:
    """Look up a video's title and author via pytube.

    This is a best-effort lookup: if constructing the ``YouTube`` object or
    reading either attribute raises any exception, the placeholder pair
    ``("Unknown", "Unknown")`` is returned instead of propagating the error.

    Args:
        url: The video URL handed to ``YouTube``.

    Returns:
        A ``(title, author)`` tuple.
    """
    fallback = ("Unknown", "Unknown")
    try:
        video = YouTube(url)
        # Attribute access stays inside the try block so that a failure
        # while reading title/author also falls back to the placeholder.
        info = (video.title, video.author)
    except Exception:
        return fallback
    return info
|
31 |
+
|
32 |
+
def get_transcript_with_retries(video_id: str, max_retries: int = 3) -> Dict:
    """Fetch the transcript text for a video, retrying on unexpected errors.

    English transcripts (``en``, then ``en-US``, then ``en-GB``) are
    preferred. When none of those exist, the first transcript the video
    offers is used instead, whatever its language.

    Args:
        video_id: The YouTube video ID to look up.
        max_retries: Maximum number of attempts. Values below 1 are treated
            as 1 so that at least one attempt is always made.

    Returns:
        A dict with the keys:
            ``success``: ``True`` only when a transcript was fetched and joined.
            ``transcript``: the transcript text (empty string on failure).
            ``error``: a human-readable message on failure, else ``None``.
            ``language``: the transcript's ``language`` attribute on success,
                else ``None``.
    """
    result = {
        'success': False,
        'transcript': '',
        'error': None,
        'language': None
    }

    attempts = max(1, max_retries)

    for attempt in range(attempts):
        try:
            # First get the list of transcripts available for the video
            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

            try:
                # Language codes are tried in the order given
                transcript = transcript_list.find_transcript(['en', 'en-US', 'en-GB'])
            except NoTranscriptFound:
                # No English variant: fall back to the first available
                # transcript in any language.
                transcript = next(iter(transcript_list), None)
                if transcript is None:
                    # Nothing at all; re-raise so the outer handler reports it
                    raise

            # Get the actual transcript
            transcript_data = transcript.fetch()
            # NOTE(review): entry['text'] assumes fetch() yields dict-like
            # entries -- confirm against the pinned youtube-transcript-api
            # version.
            text = ' '.join(entry['text'] for entry in transcript_data)

            # Only mark success once every step above has completed, so a
            # failure while building the text cannot leave success=True.
            result['success'] = True
            result['transcript'] = text
            result['language'] = transcript.language
            return result

        # These three conditions will not change on retry, so stop at once.
        except TranscriptsDisabled:
            result['error'] = "Transcripts are disabled for this video"
            break
        except VideoUnavailable:
            result['error'] = "Video is unavailable"
            break
        except NoTranscriptFound:
            result['error'] = "No transcripts found for this video"
            break
        except Exception as e:
            if attempt == attempts - 1:
                # Out of attempts: report the last error
                result['error'] = f"Error: {str(e)}"
            else:
                time.sleep(1)  # Wait before retry (skipped after the last attempt)

    return result
|
75 |
+
|
76 |
+
# ---------------------------------------------------------------------------
# Streamlit page: collects URLs, extracts a transcript for each one, shows
# the results in two tabs and offers them as a CSV download.
# ---------------------------------------------------------------------------
st.set_page_config(page_title="YouTube Transcript Extractor", layout="wide")

# NOTE(review): the leading glyph in several UI strings below looks like a
# mis-decoded emoji -- confirm the file is saved and served as UTF-8.
st.title('π YouTube Transcript Extractor')
st.markdown("""
This app extracts transcripts from YouTube videos. Simply paste one or more YouTube URLs below.
- Supports multiple YouTube URL formats
- Handles videos with disabled subtitles
- Provides detailed error messages
- Allows downloading results as CSV
""")

# Text area for input
urls = st.text_area('YouTube URLs (one per line)',
                    height=150,
                    help='Enter YouTube URLs, one per line',
                    placeholder="https://www.youtube.com/watch?v=...")

if st.button('Extract Transcripts', type='primary'):
    if urls:
        # Split URLs into list and drop blank lines
        url_list = [url.strip() for url in urls.split('\n') if url.strip()]

        if url_list:
            results = []

            # Progress tracking
            progress_text = "Extracting transcripts..."
            progress_bar = st.progress(0, text=progress_text)

            for i, url in enumerate(url_list):
                video_id = extract_video_id(url)
                if video_id:
                    # Get video info
                    title, author = get_video_info(url)

                    # Get transcript
                    transcript_result = get_transcript_with_retries(video_id)

                    results.append({
                        'URL': url,
                        'Video ID': video_id,
                        'Title': title,
                        'Channel': author,
                        'Status': 'Success' if transcript_result['success'] else 'Error',
                        'Language': transcript_result['language'] if transcript_result['success'] else None,
                        'Error': transcript_result['error'],
                        'Transcript': transcript_result['transcript'] if transcript_result['success'] else None
                    })
                else:
                    # No video ID could be extracted from this line
                    results.append({
                        'URL': url,
                        'Video ID': None,
                        'Title': 'Unknown',
                        'Channel': 'Unknown',
                        'Status': 'Error',
                        'Language': None,
                        'Error': 'Invalid YouTube URL',
                        'Transcript': None
                    })

                # Update progress
                progress_bar.progress((i + 1) / len(url_list),
                                      text=f"{progress_text} ({i + 1}/{len(url_list)})")

            # Create DataFrame (one row per input URL)
            df = pd.DataFrame(results)

            # Display results in tabs
            tab1, tab2 = st.tabs(["π Results Overview", "π Full Transcripts"])

            with tab1:
                st.dataframe(df[['URL', 'Title', 'Channel', 'Status', 'Language', 'Error']],
                             use_container_width=True)

            with tab2:
                for row_idx, row in df.iterrows():
                    if row['Status'] == 'Success':
                        with st.expander(f"πΊ {row['Title']} ({row['Channel']})"):
                            # The row index is part of the key because the
                            # same video may be listed more than once, and
                            # Streamlit rejects two widgets with one key.
                            st.text_area("Transcript:",
                                         value=row['Transcript'],
                                         height=200,
                                         key=f"transcript_{row_idx}_{row['Video ID']}")

            # Download button
            # NOTE(review): clicking it presumably triggers a rerun in which
            # st.button() is False, so the results above would disappear --
            # confirm, and consider keeping them in st.session_state.
            if not df.empty:
                csv = df.to_csv(index=False)
                st.download_button(
                    label="β¬οΈ Download results as CSV",
                    data=csv,
                    file_name="youtube_transcripts.csv",
                    mime="text/csv"
                )
        else:
            # Input contained only whitespace / blank lines
            st.error('Please enter valid YouTube URLs')
    else:
        st.warning('Please enter at least one YouTube URL')

# Add footer with information
st.markdown("---")
st.markdown("""
<div style='text-align: center'>
<p>π‘ <i>Note: This tool works best with videos that have enabled subtitles/transcripts.</i></p>
</div>
""", unsafe_allow_html=True)
|