import os
import re

import requests
import streamlit as st
import validators
import yt_dlp
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_groq import ChatGroq

# Load environment variables
load_dotenv()

# Streamlit app
st.set_page_config(page_title="LangChain: Summarize Text From YT or Website", page_icon="🦜")
st.title("🦜 LangChain: Summarize Text From YT or Website")
st.subheader("Summarize URL")

# Get API key & URL input
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    st.error("GROQ API Key not found. Please check your environment variables.")
    st.stop()  # ChatGroq below cannot be constructed without a key

generic_url = st.text_input("Enter YouTube or Website URL", label_visibility="collapsed")

# LangChain model with Groq API
llm = ChatGroq(model="gemma2-9b-it", groq_api_key=groq_api_key)

# Prompt template
prompt_template = """
Provide a clear and concise summary in 300 words of the following content:

{text}

Focus on the main points and key insights. Write in a professional tone.
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])


def get_youtube_content(url):
    """Get title, metadata, and description from a YouTube video using yt-dlp."""
    try:
        ydl_opts = {
            'format': 'worst',
            'extract_flat': True,
            'quiet': True,
            'no_warnings': True,
            'extractor_args': {
                'youtube': {
                    'skip': ['dash', 'hls'],
                }
            },
            'cookiesfrombrowser': ('chrome',),  # Get cookies from Chrome
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

        title = info.get('title', '')
        description = info.get('description', '')
        views = info.get('view_count', 'Unknown')
        uploader = info.get('uploader', 'Unknown')
        upload_date = info.get('upload_date', 'Unknown')

        content = f"""
Video Title: {title}
Uploader: {uploader}
Upload Date: {upload_date}
Views: {views}

Description:
{description}
"""
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error getting YouTube content: {str(e)}")
        return None


def get_website_content(url):
    """Get the main text content from a website using requests and BeautifulSoup."""
    try:
        # Send request with headers that mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, verify=False)  # verify=False tolerates broken SSL certs
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        # Get title
        title = soup.title.string if soup.title else "No title found"

        # Get main content (adjust selectors based on the website structure)
        main_content = ""

        # Try to find article content first
        article = soup.find('article')
        if article:
            main_content = article.get_text()
        else:
            # If there is no <article> tag, try common content containers
            content_tags = ['main', 'div.content', 'div.post-content', 'div.article-content']
            for tag in content_tags:
                element = soup.select_one(tag)
                if element:
                    main_content = element.get_text()
                    break

        # If there is still no content, fall back to all paragraph text
        if not main_content:
            paragraphs = soup.find_all('p')
            main_content = '\n'.join(p.get_text() for p in paragraphs)

        # Clean up the text: collapse whitespace and strip any leftover HTML tags
        main_content = re.sub(r'\s+', ' ', main_content).strip()
        main_content = re.sub(r'<[^>]+>', '', main_content)

        content = f"""
Title: {title}
URL: {url}

Content:
{main_content}
"""
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error fetching or processing {url}, exception:\n{str(e)}")
        return None


if st.button("Summarize the Content from YT or Website"):
    # Validate input
    if not groq_api_key or not generic_url.strip():
        st.error("Please provide a valid API key and URL.")
    elif not validators.url(generic_url):
        st.error("Please enter a valid URL (YouTube or a website).")
    else:
        try:
            with st.spinner("Fetching content and summarizing..."):
                # Load data from YouTube or a website
                if "youtube.com" in generic_url or "youtu.be" in generic_url:
                    docs = get_youtube_content(generic_url)
                else:
                    docs = get_website_content(generic_url)

                if docs is None:
                    st.stop()

                # Create the summary chain and run it
                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                output_summary = chain.run(docs)

                # Display the results
                st.success("Summary Generated Successfully!")
                tab1, tab2 = st.tabs(["Summary", "Raw Content"])
                with tab1:
                    st.write(output_summary)
                with tab2:
                    if docs:
                        st.text_area("Original Content", docs[0].page_content, height=300)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
            st.exception(e)
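# ---------------------------------------------------------------------------
# Usage sketch (assumptions: this script is saved as app.py, and a .env file
# containing GROQ_API_KEY=<your-key> sits in the same directory; the package
# names below simply mirror the imports at the top of this file):
#
#   pip install streamlit validators python-dotenv langchain langchain-groq \
#       yt-dlp requests beautifulsoup4
#   streamlit run app.py
#
# Streamlit serves the app on http://localhost:8501 by default; paste a
# YouTube or article URL and click "Summarize the Content from YT or Website".
# ---------------------------------------------------------------------------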