File size: 6,214 Bytes
2fc5eb1
 
 
 
 
 
 
 
f8b6c23
2fc5eb1
 
 
 
 
 
 
 
f8b6c23
 
 
2fc5eb1
 
 
 
 
 
f8b6c23
2fc5eb1
 
f8b6c23
2fc5eb1
 
 
 
 
 
 
 
 
 
 
 
f8b6c23
2fc5eb1
f8b6c23
 
 
 
48c4ffc
 
 
 
 
 
 
f8b6c23
 
 
 
 
 
 
 
 
 
 
 
8f7a009
f8b6c23
 
 
8f7a009
f8b6c23
 
 
 
 
2fc5eb1
 
 
48c4ffc
2fc5eb1
 
 
f8b6c23
2fc5eb1
 
 
 
 
 
f8b6c23
2fc5eb1
 
f8b6c23
 
 
2fc5eb1
f8b6c23
2fc5eb1
 
f8b6c23
2fc5eb1
 
f8b6c23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fc5eb1
f8b6c23
 
2fc5eb1
f8b6c23
2fc5eb1
 
 
 
 
 
 
 
 
 
 
 
f8b6c23
2fc5eb1
 
f8b6c23
 
2fc5eb1
 
 
f8b6c23
2fc5eb1
 
f8b6c23
 
2fc5eb1
 
 
 
 
 
 
 
f8b6c23
2fc5eb1
 
 
f8b6c23
 
2fc5eb1
f8b6c23
2fc5eb1
 
f8b6c23
2fc5eb1
 
 
f8b6c23
2fc5eb1
 
f8b6c23
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import os
import re
from urllib.parse import urlparse

import requests
import streamlit as st
import validators
import yt_dlp
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_groq import ChatGroq

# Load environment variables (python-dotenv) before anything reads os.environ
load_dotenv()

# Streamlit App
st.set_page_config(page_title="LangChain: Summarize Text From YT or Website", page_icon="🦜")
st.title("🦜 LangChain: Summarize Text From YT or Website")
st.subheader("Summarize URL")

# Get API Key & URL input
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    st.error("GROQ API Key not found. Please check your environment variables.")

generic_url = st.text_input("Enter YouTube or Website URL", label_visibility="collapsed")

# LangChain Model with Groq API.
# Only build the client when a key is available: constructing it without one
# raises during script start-up and would replace the error message above with
# a traceback. The summarize handler further down already refuses to run when
# `groq_api_key` is empty, so `llm` is never used while it is None.
llm = ChatGroq(model="gemma2-9b-it", groq_api_key=groq_api_key) if groq_api_key else None

# Prompt Template: the fetched page/video text is substituted for {text}
prompt_template = """

Provide a clear and concise summary in 300 words of the following content:



{text}



Focus on the main points and key insights. Write in a professional tone.

"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

def get_youtube_content(url):
    """Get content from YouTube video using yt-dlp.

    Only metadata is requested (``download=False``). The returned text is
    assembled from the video's title, uploader, upload date, view count and
    description; no transcript or media is fetched by this function.

    Args:
        url: URL of the video to inspect.

    Returns:
        A one-element list holding a ``Document`` with the formatted metadata,
        or ``None`` when extraction fails (the error is reported through
        ``st.error``).
    """
    try:
        ydl_opts = {
            'format': 'worst',
            'extract_flat': True,
            'quiet': True,
            'no_warnings': True,
            'extractor_args': {
                'youtube': {
                    'skip': ['dash', 'hls'],
                }
            },
            'cookiesfrombrowser': ('chrome', ),  # Get cookies from Chrome
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

            def field(key, fallback):
                # dict.get()'s default only covers *missing* keys. A field
                # that is present but set to None would otherwise be rendered
                # as the literal text "None". Compare against None explicitly
                # so a genuine 0 (e.g. view count) is preserved.
                value = info.get(key)
                return fallback if value is None else value

            title = field('title', '')
            description = field('description', '')
            views = field('view_count', 'Unknown')
            uploader = field('uploader', 'Unknown')
            upload_date = field('upload_date', 'Unknown')

            content = f"""

Video Title: {title}

Uploader: {uploader}

Upload Date: {upload_date}

Views: {views}



Description:

{description}

"""
            return [Document(page_content=content)]

    except Exception as e:
        # UI boundary: surface any extractor/network failure to the user and
        # signal it to the caller with None instead of propagating.
        st.error(f"Error getting YouTube content: {str(e)}")
        return None
    
def get_website_content(url, *, timeout=15, verify=True):
    """Get content from website using requests and BeautifulSoup.

    The page is downloaded, ``<script>``/``<style>`` elements are removed, and
    the main text is taken from the first match of: ``<article>``, then
    ``main`` / ``div.content`` / ``div.post-content`` / ``div.article-content``,
    then all ``<p>`` elements joined together.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the app indefinitely.
        verify: Whether to verify the server's TLS certificate. Enabled by
            default because the URL comes from user input; pass ``False`` only
            for hosts known to have a broken certificate.

    Returns:
        A one-element list holding a ``Document`` with the title, URL and
        extracted text, or ``None`` when fetching or parsing fails (the error
        is reported through ``st.error``).
    """
    try:
        # Send request with headers to mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout, verify=verify)
        response.raise_for_status()

        # Parse from raw bytes so BeautifulSoup can honour a charset declared
        # in the document itself. `response.text` falls back to ISO-8859-1 for
        # text/* responses whose header names no charset, which garbles UTF-8
        # pages. Only pass the header encoding along when it was declared.
        content_type = response.headers.get('content-type', '').lower()
        declared_encoding = response.encoding if 'charset' in content_type else None
        soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        # Get title. `.string` is None for an empty <title> or one containing
        # nested markup, so read the text instead and fall back explicitly.
        title = soup.title.get_text(strip=True) if soup.title else ""
        if not title:
            title = "No title found"

        # Get main content (adjust selectors based on the website structure)
        main_content = ""

        # Try to find article content first
        article = soup.find('article')
        if article:
            main_content = article.get_text()
        else:
            # If no article tag, try common content containers in priority order
            content_tags = ['main', 'div.content', 'div.post-content', 'div.article-content']
            for tag in content_tags:
                element = soup.select_one(tag)
                if element:
                    main_content = element.get_text()
                    break

            # If still no content, get all paragraph text
            if not main_content:
                paragraphs = soup.find_all('p')
                main_content = '\n'.join(p.get_text() for p in paragraphs)

        # Clean up the text
        # Remove extra whitespace and newlines
        main_content = re.sub(r'\s+', ' ', main_content).strip()
        # Remove any remaining HTML tags
        # NOTE(review): get_text() output should not contain markup; this
        # pattern also removes literal "<...>" text from the page - confirm
        # it is still wanted.
        main_content = re.sub(r'<[^>]+>', '', main_content)

        content = f"""

Title: {title}

URL: {url}



Content:

{main_content}

"""
        return [Document(page_content=content)]

    except Exception as e:
        # UI boundary: surface network/parse failures to the user and signal
        # them to the caller with None instead of propagating.
        st.error(f"Error fetching or processing {url}, exception:\n{str(e)}")
        return None

if st.button("Summarize the Content from YT or Website"):
    # Strip once and use the same value everywhere: a pasted URL with
    # surrounding whitespace used to pass the emptiness check but then fail
    # URL validation.
    target_url = generic_url.strip()

    # Validate Input
    if not groq_api_key or not target_url:
        st.error("Please provide a valid API key and URL.")
    elif not validators.url(target_url):
        st.error("Please enter a valid URL (YouTube or a website).")
    else:
        try:
            with st.spinner("Fetching content and summarizing..."):
                # Route on the parsed hostname rather than a substring test,
                # so a URL that merely mentions "youtube.com" in its path or
                # query string is treated as an ordinary website.
                host = urlparse(target_url).hostname or ""
                is_youtube = any(
                    host == domain or host.endswith("." + domain)
                    for domain in ("youtube.com", "youtu.be")
                )

                # Load data from YouTube or Website
                if is_youtube:
                    docs = get_youtube_content(target_url)
                else:
                    docs = get_website_content(target_url)

                # The fetch helpers report their own error in the UI and
                # return None; nothing is left to summarize, so end this run.
                if docs is None:
                    st.stop()

                # Create the summary chain and run it
                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                output_summary = chain.run(docs)

                # Display the results
                st.success("Summary Generated Successfully!")

                tab1, tab2 = st.tabs(["Summary", "Raw Content"])

                with tab1:
                    st.write(output_summary)

                with tab2:
                    if docs:
                        st.text_area("Original Content", docs[0].page_content, height=300)

        except Exception as e:
            # Show a short message plus the full traceback for debugging
            st.error(f"An error occurred: {str(e)}")
            st.exception(e)