File size: 3,415 Bytes
26b0eae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import streamlit as st
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi
import re
from io import StringIO

def extract_video_id(url):
    """Return the 11-character video ID embedded in a YouTube URL.

    Recognised forms (the scheme is optional, as are sub-domains such as
    ``www.``, ``m.`` and ``music.``):

    * ``youtube.com/watch?v=<id>`` -- ``v`` may sit anywhere in the query
    * ``youtube.com/embed/<id>``, ``/v/<id>``, ``/e/<id>``,
      ``/shorts/<id>``, ``/live/<id>``
    * ``youtube-nocookie.com/embed/<id>``
    * ``youtu.be/<id>``

    Unlike a bare "slash followed by 11 characters" regex, the host is
    checked first, so path segments of unrelated sites (or of YouTube
    channel/playlist pages) are not mistaken for video IDs.

    Args:
        url: The URL text to inspect.

    Returns:
        The video ID as a string, or ``None`` when ``url`` is not a string,
        does not point at a YouTube host, or carries no well-formed ID.
    """
    # Function-scope import keeps this function self-contained.
    from urllib.parse import parse_qs, urlparse

    if not isinstance(url, str):
        return None
    text = url.strip()
    if not text:
        return None

    # urlparse only populates the host when a scheme (or a leading "//") is
    # present, so scheme-less input such as "youtu.be/<id>" gets one added.
    if not re.match(r'(?:[A-Za-z][A-Za-z0-9+.-]*:)?//', text):
        text = 'https://' + text

    try:
        parts = urlparse(text)
        host = (parts.hostname or '').rstrip('.')
    except ValueError:
        # Raised for malformed authorities, e.g. an unbalanced "[".
        return None

    segments = [segment for segment in parts.path.split('/') if segment]

    candidate = None
    if host in ('youtu.be', 'www.youtu.be'):
        # Short links carry the ID as the first path segment.
        if segments:
            candidate = segments[0]
    elif (host in ('youtube.com', 'youtube-nocookie.com')
          or host.endswith(('.youtube.com', '.youtube-nocookie.com'))):
        query_ids = parse_qs(parts.query).get('v')
        if query_ids:
            candidate = query_ids[0]
        elif (len(segments) >= 2
              and segments[0] in ('embed', 'v', 'e', 'shorts', 'live')):
            candidate = segments[1]

    # A video ID is exactly 11 characters from the URL-safe base64 alphabet.
    if candidate is not None and re.fullmatch(r'[0-9A-Za-z_-]{11}', candidate):
        return candidate
    return None

def get_transcript(video_id):
    """Fetch the transcript for a single video.

    Works with both generations of the youtube-transcript-api interface: the
    legacy static ``get_transcript`` call is used when the installed package
    still provides it; otherwise the instance-based ``fetch`` call is used
    and its result is converted back to the legacy list-of-dicts shape.

    Args:
        video_id: YouTube video ID to look up.

    Returns:
        On success, the list of transcript entries. On any failure, the
        exception message as a string, so a caller can tell the two outcomes
        apart with ``isinstance(result, list)``.
    """
    try:
        legacy_fetch = getattr(YouTubeTranscriptApi, 'get_transcript', None)
        if legacy_fetch is not None:
            return legacy_fetch(video_id)
        # NOTE(review): newer youtube-transcript-api releases dropped the
        # static helper in favour of instance.fetch(); to_raw_data() should
        # yield the same list of {'text', 'start', 'duration'} dicts --
        # confirm against the version pinned for this app.
        return YouTubeTranscriptApi().fetch(video_id).to_raw_data()
    except Exception as e:
        # Deliberate catch-all: any failure is reported back as text
        # instead of propagating to the caller.
        return str(e)

def _result_row(url):
    """Build the results-table row (a dict) for one input URL."""
    video_id = extract_video_id(url)
    if not video_id:
        return {
            'URL': url,
            'Video ID': None,
            'Status': 'Error',
            'Transcript': 'Invalid YouTube URL',
        }

    transcript = get_transcript(video_id)
    if isinstance(transcript, list):
        # A list means the fetch succeeded: flatten the entries to one string.
        status = 'Success'
        text = ' '.join([entry['text'] for entry in transcript])
    else:
        # Anything else is the error message returned by get_transcript.
        status = 'Error'
        text = transcript
    return {
        'URL': url,
        'Video ID': video_id,
        'Status': status,
        'Transcript': text,
    }


st.title('YouTube Transcript Extractor')
st.write('Enter YouTube video URLs (one per line) to extract their transcripts.')

# Multi-line input box: one URL per line.
urls = st.text_area(
    'YouTube URLs',
    height=150,
    help='Enter one YouTube URL per line',
)

if st.button('Extract Transcripts'):
    if not urls:
        st.warning('Please enter at least one YouTube URL')
    else:
        # Drop blank lines and surrounding whitespace.
        stripped_lines = (line.strip() for line in urls.split('\n'))
        pending = [line for line in stripped_lines if line]

        if not pending:
            st.error('Please enter valid YouTube URLs')
        else:
            progress_bar = st.progress(0)
            total = len(pending)

            rows = []
            for done, url in enumerate(pending, start=1):
                rows.append(_result_row(url))
                # Advance the bar to the fraction of URLs handled so far.
                progress_bar.progress(done / total)

            table = pd.DataFrame(rows)

            # The on-screen table omits the 'Video ID' column; the CSV keeps it.
            st.subheader('Results')
            st.dataframe(table[['URL', 'Status', 'Transcript']])

            if not table.empty:
                st.download_button(
                    label="Download transcripts as CSV",
                    data=table.to_csv(index=False),
                    file_name="youtube_transcripts.csv",
                    mime="text/csv",
                )