Spaces:
Running
Running
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
4 |
+
import re
|
5 |
+
from io import StringIO
|
6 |
+
|
7 |
+
def extract_video_id(url):
    """Extract the 11-character YouTube video ID from a URL.

    Recognised forms (the scheme is optional):

    * ``youtube.com/watch?v=ID`` (any position of ``v`` in the query string)
    * ``youtu.be/ID``
    * ``youtube.com/embed/ID``, ``/shorts/ID``, ``/live/ID``, ``/v/ID``, ``/e/ID``

    Only ``youtu.be``, ``youtube.com`` and ``youtube-nocookie.com`` hosts
    (including subdomains such as ``www.`` or ``m.``) are accepted, so an
    arbitrary URL that merely contains a slash followed by 11 URL-safe
    characters is no longer mistaken for a video link.

    Args:
        url: The URL text as entered by the user.

    Returns:
        The video ID string, or ``None`` when ``url`` is not a recognisable
        YouTube video URL.
    """
    # Local import keeps this function self-contained; the module itself
    # only imports ``re`` at the top of the file.
    from urllib.parse import parse_qs, urlsplit

    if not isinstance(url, str):
        return None
    text = url.strip()
    if not text:
        return None

    # urlsplit() only populates the host when a scheme is present, so add
    # one for inputs such as "youtu.be/<id>" or "www.youtube.com/watch?v=<id>".
    if not re.match(r'[A-Za-z][A-Za-z0-9+.-]*://', text):
        text = 'https://' + text.lstrip('/')

    try:
        parts = urlsplit(text)
        host = (parts.hostname or '').lower()
    except ValueError:
        # Malformed URL (e.g. an unbalanced IPv6 bracket).
        return None

    def is_host(domain):
        """Return True when the parsed host is ``domain`` or a subdomain of it."""
        return host == domain or host.endswith('.' + domain)

    path_segments = [segment for segment in parts.path.split('/') if segment]
    candidate = None
    if is_host('youtu.be'):
        # Short links carry the ID as the first path segment.
        candidate = path_segments[0] if path_segments else None
    elif is_host('youtube.com') or is_host('youtube-nocookie.com'):
        if len(path_segments) >= 2 and path_segments[0] in ('embed', 'shorts', 'live', 'v', 'e'):
            candidate = path_segments[1]
        else:
            query_values = parse_qs(parts.query).get('v')
            candidate = query_values[0] if query_values else None

    # Require exactly 11 ID characters; longer segments are not truncated.
    if candidate and re.fullmatch(r'[0-9A-Za-z_-]{11}', candidate):
        return candidate
    return None
|
20 |
+
|
21 |
+
def get_transcript(video_id):
    """Fetch the transcript for a single video.

    Works with both generations of the youtube-transcript-api package:
    the instance-based ``YouTubeTranscriptApi().fetch(...)`` API when the
    class exposes ``fetch``, otherwise the legacy static
    ``YouTubeTranscriptApi.get_transcript(...)``.

    Args:
        video_id: The YouTube video ID to look up.

    Returns:
        On success, a list of transcript entries (dicts; callers read the
        ``'text'`` key of each). On any failure, the error message as a
        string -- callers tell the two apart with ``isinstance(..., list)``.
    """
    try:
        if hasattr(YouTubeTranscriptApi, 'fetch'):
            # Newer releases: fetch() returns a transcript object whose
            # to_raw_data() gives the same list-of-dicts shape the legacy
            # call returned.
            fetched = YouTubeTranscriptApi().fetch(video_id)
            return list(fetched.to_raw_data())
        # Legacy releases only provide the static helper.
        return list(YouTubeTranscriptApi.get_transcript(video_id))
    except Exception as e:
        # Deliberate catch-all: the UI reports the message per video instead
        # of aborting the whole batch. Fall back to repr() so an exception
        # with an empty message still yields a non-empty description.
        return str(e) or repr(e)
|
28 |
+
|
29 |
+
def _build_result_row(url):
    """Return the result-table row (a dict) for one input URL.

    The row always has the keys 'URL', 'Video ID', 'Status' and
    'Transcript'; on failure 'Transcript' holds the error message.
    """
    video_id = extract_video_id(url)
    if not video_id:
        status, text = 'Error', 'Invalid YouTube URL'
        video_id = None
    else:
        transcript = get_transcript(video_id)
        if isinstance(transcript, list):
            # Successful extraction: flatten the entries into one string.
            status = 'Success'
            text = ' '.join(entry['text'] for entry in transcript)
        else:
            # get_transcript() returns the error message as a string.
            status, text = 'Error', transcript
    return {
        'URL': url,
        'Video ID': video_id,
        'Status': status,
        'Transcript': text,
    }


st.title('YouTube Transcript Extractor')
st.write('Enter YouTube video URLs (one per line) to extract their transcripts.')

# Text area for input
urls = st.text_area('YouTube URLs', height=150,
                    help='Enter one YouTube URL per line')

if st.button('Extract Transcripts'):
    # A new run replaces whatever was shown before, including on bad input.
    st.session_state['results_df'] = None

    if urls:
        # One URL per line; blank lines are ignored.
        url_list = [line.strip() for line in urls.split('\n') if line.strip()]

        if url_list:
            results = []
            progress_bar = st.progress(0)

            for i, url in enumerate(url_list):
                results.append(_build_result_row(url))
                progress_bar.progress((i + 1) / len(url_list))

            # Keep the table in session state: st.button() is only True on
            # the run triggered by its own click, so anything rendered solely
            # inside this branch would vanish on the rerun caused by
            # pressing the download button.
            st.session_state['results_df'] = pd.DataFrame(results)
        else:
            st.error('Please enter valid YouTube URLs')
    else:
        st.warning('Please enter at least one YouTube URL')

# Render the most recent results on every run, not just the click run.
df = st.session_state['results_df'] if 'results_df' in st.session_state else None
if df is not None and not df.empty:
    st.subheader('Results')
    st.dataframe(df[['URL', 'Status', 'Transcript']])

    # The CSV keeps the 'Video ID' column that the on-screen table omits.
    st.download_button(
        label="Download transcripts as CSV",
        data=df.to_csv(index=False),
        file_name="youtube_transcripts.csv",
        mime="text/csv",
    )
|