Spaces:
Running
Running
import streamlit as st | |
import pandas as pd | |
from youtube_transcript_api import YouTubeTranscriptApi | |
import re | |
from io import StringIO | |
# URL shapes that carry a video ID, tried in order. Each pattern captures
# exactly 11 ID characters and uses a negative lookahead so that a longer
# token (channel ID, user name, ...) is rejected instead of being truncated.
_VIDEO_ID_PATTERNS = (
    # ...watch?v=<id> / ...&v=<id>, or a bare "v=<id>" at the start of the input
    re.compile(r'(?:^|[?&])v=([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'),
    # youtu.be/<id> short links
    re.compile(r'youtu\.be\/([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'),
    # /embed/<id>, /shorts/<id>, /live/<id>, /watch/<id>, /v/<id>, /vi/<id>, /e/<id>
    re.compile(
        r'\/(?:embed|shorts|live|watch|v|vi|e)\/'
        r'([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])'
    ),
)


def extract_video_id(url):
    """Extract the 11-character YouTube video ID from a URL.

    Only known URL shapes are recognised: a ``v=`` query parameter,
    ``youtu.be/<id>`` short links, and path forms such as ``/embed/<id>``
    or ``/shorts/<id>``. An arbitrary 11-character path segment (for
    example on a channel page or a non-YouTube site) is not treated as a
    video ID.

    Args:
        url: The URL text to inspect.

    Returns:
        The video ID as a string, or ``None`` when ``url`` is not a string
        or contains no recognisable video ID.
    """
    if not isinstance(url, str):
        return None

    for pattern in _VIDEO_ID_PATTERNS:
        match = pattern.search(url)
        if match:
            return match.group(1)
    return None
def get_transcript(video_id):
    """Fetch the transcript for one video, reporting failures in-band.

    Returns whatever ``YouTubeTranscriptApi.get_transcript`` returns on
    success. If that call raises any ``Exception``, the exception's message
    is returned as a string instead of being propagated, so callers tell
    success from failure by the type of the result.
    """
    try:
        return YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as err:
        # Hand the error text back to the caller rather than raising.
        return str(err)
# ---- Page header -----------------------------------------------------------
st.title('YouTube Transcript Extractor')
st.write('Enter YouTube video URLs (one per line) to extract their transcripts.')

# ---- Input: one URL per line -------------------------------------------------
urls = st.text_area(
    'YouTube URLs',
    height=150,
    help='Enter one YouTube URL per line',
)

# ---- Extraction runs only on the script run triggered by the button ---------
if st.button('Extract Transcripts'):
    if not urls:
        st.warning('Please enter at least one YouTube URL')
    else:
        # Keep only non-blank lines, with surrounding whitespace removed.
        url_list = [line.strip() for line in urls.split('\n') if line.strip()]

        if not url_list:
            st.error('Please enter valid YouTube URLs')
        else:
            rows = []
            total = len(url_list)
            bar = st.progress(0)

            for position, link in enumerate(url_list, start=1):
                vid = extract_video_id(link)

                if not vid:
                    # No video ID could be extracted from this line.
                    row = {
                        'URL': link,
                        'Video ID': None,
                        'Status': 'Error',
                        'Transcript': 'Invalid YouTube URL',
                    }
                else:
                    fetched = get_transcript(vid)
                    if isinstance(fetched, list):
                        # A list means success: join the segment texts.
                        status = 'Success'
                        text = ' '.join(segment['text'] for segment in fetched)
                    else:
                        # Anything else is the error message from the fetch.
                        status = 'Error'
                        text = fetched
                    row = {
                        'URL': link,
                        'Video ID': vid,
                        'Status': status,
                        'Transcript': text,
                    }

                rows.append(row)
                # Advance the bar by the fraction of URLs handled so far.
                bar.progress(position / total)

            # Tabulate the per-URL outcomes.
            df = pd.DataFrame(rows)

            st.subheader('Results')
            st.dataframe(df[['URL', 'Status', 'Transcript']])

            # Offer the full table (including the Video ID column) as CSV.
            if not df.empty:
                csv_text = df.to_csv(index=False)
                st.download_button(
                    label="Download transcripts as CSV",
                    data=csv_text,
                    file_name="youtube_transcripts.csv",
                    mime="text/csv",
                )