Spaces:

noumanjavaid
/

yt-transcript

Running

App Files Files Community

yt-transcript / yt-app.py

noumanjavaid

Create yt-app.py

7aea2f1 verified 6 months ago

raw

history blame contribute delete

6.86 kB

	import streamlit as st
	import pandas as pd
	from youtube_transcript_api import YouTubeTranscriptApi
	from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, VideoUnavailable
	from pytube import YouTube
	import re
	from typing import Dict, List, Optional, Tuple
	import time

	def extract_video_id(url: str) -> Optional[str]:
	    """Extract the 11-character YouTube video ID from a URL.

	    Recognized forms are a ``v=`` query parameter in any position
	    (``watch?v=ID``, ``watch?feature=share&v=ID``) and the path styles
	    ``youtu.be/ID``, ``/embed/ID``, ``/shorts/ID``, ``/live/ID`` and
	    ``/v/ID``.

	    Args:
	        url: The URL text to inspect.

	    Returns:
	        The video ID, or ``None`` when ``url`` is empty or contains none of
	        the supported forms.
	    """
	    if not url:
	        return None

	    id_group = r'([0-9A-Za-z_-]{11})'
	    patterns = (
	        # The previous first pattern escaped its alternation bar, so it
	        # only matched the literal text "v=|/" and every ordinary
	        # watch?v=... URL was rejected. Anchoring on "?" / "&" also keeps
	        # parameters that merely end in "v" from matching.
	        r'[?&]v=' + id_group,
	        r'(?:youtu\.be/|/(?:embed|shorts|live|v)/)' + id_group,
	    )

	    for pattern in patterns:
	        match = re.search(pattern, url)
	        if match:
	            return match.group(1)
	    return None

	def get_video_info(url: str) -> Tuple[str, str]:
	    """Return ``(title, author)`` for a video, looked up through pytube.

	    The lookup is best-effort: if anything goes wrong while building the
	    ``YouTube`` object or reading its attributes, both fields fall back to
	    the placeholder ``"Unknown"``.
	    """
	    try:
	        video = YouTube(url)
	        info = (video.title, video.author)
	    except Exception:
	        info = ("Unknown", "Unknown")
	    return info

	def get_transcript_with_retries(video_id: str, max_retries: int = 3) -> Dict:
	    """Fetch a video's transcript, retrying when an unexpected error occurs.

	    English is preferred ('en', then 'en-US', then 'en-GB'). When the video
	    has no English transcript, the first transcript it does offer is used
	    instead, so non-English videos still produce a result.

	    Args:
	        video_id: The YouTube video ID.
	        max_retries: Total number of attempts to make. Must be at least 1.

	    Returns:
	        A dict with the keys:
	        'success' (bool), 'transcript' (the joined transcript text, or ''),
	        'error' (a message, or None on success) and 'language' (the
	        transcript's language name, or None).

	    Disabled transcripts, an unavailable video and a video with no
	    transcripts at all are reported immediately without retrying; any other
	    exception is retried with a one-second pause between attempts.
	    """
	    result = {
	        'success': False,
	        'transcript': '',
	        'error': None,
	        'language': None
	    }

	    if max_retries < 1:
	        # Without this the loop never runs and the caller receives a
	        # failure that carries no explanation.
	        result['error'] = "max_retries must be at least 1"
	        return result

	    for attempt in range(max_retries):
	        try:
	            transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

	            try:
	                transcript = transcript_list.find_transcript(['en', 'en-US', 'en-GB'])
	            except NoTranscriptFound:
	                # No English variant: fall back to whatever the video offers.
	                transcript = next(iter(transcript_list), None)
	                if transcript is None:
	                    raise

	            transcript_data = transcript.fetch()
	            # NOTE(review): older youtube-transcript-api releases yield dicts,
	            # newer ones appear to yield snippet objects with a ``text``
	            # attribute - accept both; confirm against the pinned version.
	            result['transcript'] = ' '.join(
	                entry['text'] if isinstance(entry, dict) else entry.text
	                for entry in transcript_data
	            )
	            result['language'] = transcript.language
	            result['success'] = True
	            return result

	        except TranscriptsDisabled:
	            result['error'] = "Transcripts are disabled for this video"
	            return result
	        except VideoUnavailable:
	            result['error'] = "Video is unavailable"
	            return result
	        except NoTranscriptFound:
	            result['error'] = "No transcripts found for this video"
	            return result
	        except Exception as exc:
	            if attempt == max_retries - 1:
	                # Only the final failure is reported, so a later successful
	                # attempt never carries a stale error message.
	                result['error'] = f"Error: {exc}"
	            else:
	                # Pause only when another attempt will actually follow.
	                time.sleep(1)

	    return result

	def _build_result_row(url: str) -> Dict:
	    """Look up one URL and return its row for the results table.

	    URLs from which no video ID can be extracted produce an 'Invalid YouTube
	    URL' error row. Otherwise the video's title/channel and its transcript
	    are fetched, and transcript text and language are only filled in when
	    the fetch succeeded.
	    """
	    video_id = extract_video_id(url)
	    if not video_id:
	        return {
	            'URL': url,
	            'Video ID': None,
	            'Title': 'Unknown',
	            'Channel': 'Unknown',
	            'Status': 'Error',
	            'Language': None,
	            'Error': 'Invalid YouTube URL',
	            'Transcript': None,
	        }

	    title, author = get_video_info(url)
	    transcript_result = get_transcript_with_retries(video_id)
	    succeeded = transcript_result['success']
	    return {
	        'URL': url,
	        'Video ID': video_id,
	        'Title': title,
	        'Channel': author,
	        'Status': 'Success' if succeeded else 'Error',
	        'Language': transcript_result['language'] if succeeded else None,
	        'Error': transcript_result['error'],
	        'Transcript': transcript_result['transcript'] if succeeded else None,
	    }


	# --- Page setup ---------------------------------------------------------
	st.set_page_config(page_title="YouTube Transcript Extractor", layout="wide")

	st.title('📝 YouTube Transcript Extractor')
	st.markdown("""
	This app extracts transcripts from YouTube videos. Simply paste one or more YouTube URLs below.
	- Supports multiple YouTube URL formats
	- Handles videos with disabled subtitles
	- Provides detailed error messages
	- Allows downloading results as CSV
	""")

	# Text area for input
	urls = st.text_area('YouTube URLs (one per line)',
	                    height=150,
	                    help='Enter YouTube URLs, one per line',
	                    placeholder="https://www.youtube.com/watch?v=...")

	# NOTE(review): the results below are rendered only in the run triggered by
	# this button; a later rerun (e.g. after pressing the download button) will
	# presumably hide them again. Consider keeping them in st.session_state.
	if st.button('Extract Transcripts', type='primary'):
	    # One URL per line; blank and whitespace-only lines are ignored.
	    url_list = [line.strip() for line in urls.split('\n') if line.strip()] if urls else []

	    if not urls:
	        st.warning('Please enter at least one YouTube URL')
	    elif not url_list:
	        st.error('Please enter valid YouTube URLs')
	    else:
	        results = []

	        # Progress tracking
	        progress_text = "Extracting transcripts..."
	        progress_bar = st.progress(0, text=progress_text)
	        total = len(url_list)

	        for position, url in enumerate(url_list, start=1):
	            results.append(_build_result_row(url))
	            progress_bar.progress(position / total,
	                                  text=f"{progress_text} ({position}/{total})")

	        df = pd.DataFrame(results)

	        # Display results in tabs
	        tab1, tab2 = st.tabs(["📊 Results Overview", "📑 Full Transcripts"])

	        with tab1:
	            st.dataframe(df[['URL', 'Title', 'Channel', 'Status', 'Language', 'Error']],
	                         use_container_width=True)

	        with tab2:
	            successful = df[df['Status'] == 'Success']
	            if successful.empty:
	                st.info('No transcripts could be extracted.')
	            for row_index, row in successful.iterrows():
	                with st.expander(f"📺 {row['Title']} ({row['Channel']})"):
	                    # The row index keeps the widget key unique even when the
	                    # same video was entered more than once; a key built from
	                    # the video ID alone would be duplicated in that case.
	                    st.text_area("Transcript:",
	                                 value=row['Transcript'],
	                                 height=200,
	                                 key=f"transcript_{row_index}_{row['Video ID']}")

	        # Download button (there is always at least one row at this point)
	        csv = df.to_csv(index=False)
	        st.download_button(
	            label="⬇️ Download results as CSV",
	            data=csv,
	            file_name="youtube_transcripts.csv",
	            mime="text/csv"
	        )

	# Add footer with information
	st.markdown("---")
	st.markdown("""
	<div style='text-align: center'>
	<p>💡 <i>Note: This tool works best with videos that have enabled subtitles/transcripts.</i></p>
	</div>
	""", unsafe_allow_html=True)