import os
import re
from urllib.parse import urlparse, parse_qs

import requests
import validators
import streamlit as st
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
from youtube_transcript_api import YouTubeTranscriptApi

# Load environment variables
load_dotenv()

# Streamlit app configuration
st.set_page_config(page_title="AI Content Summarizer", page_icon="📚")

# Two columns for the title row
col1, col2 = st.columns([0.85, 0.15])
with col1:
    st.title("AI Content Summarizer")
    st.caption("Powered by LangChain & Gemma 🤖")
with col2:
    st.image("https://python.langchain.com/img/favicon.ico", width=50)

st.markdown("""
### About This App
This application uses LangChain and the Gemma model to automatically generate
concise summaries of YouTube videos and web articles. Whether you're researching
a topic, catching up on content, or trying to quickly grasp key information,
this tool saves time by distilling content into clear, readable summaries.
""")

# Get API key & URL input
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    st.error("GROQ API Key not found. Please check your environment variables.")
    # Halt here; constructing ChatGroq below without a key would crash the app.
    st.stop()

generic_url = st.text_input(
    "Enter YouTube or Website URL",
    placeholder="https://example.com or https://youtube.com/watch?v=...",
)

# LangChain model backed by the Groq API
llm = ChatGroq(model="gemma-7b-it", groq_api_key=groq_api_key)

# Prompt template
prompt_template = """
Provide a clear and concise summary, in about 300 words, of the following content:
{text}
Focus on the main points and key insights. Write in a professional tone.
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])


def get_youtube_id(url):
    """Extract the video ID from a YouTube URL."""
    if 'youtube.com' in url:
        query = parse_qs(urlparse(url).query)
        return query.get('v', [None])[0]
    elif 'youtu.be' in url:
        return urlparse(url).path[1:]
    return None


def get_youtube_content(url):
    """Get content from a YouTube video using youtube-transcript-api."""
    try:
        video_id = get_youtube_id(url)
        if not video_id:
            raise ValueError("Could not extract YouTube video ID")

        # Get the transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = ' '.join(entry['text'] for entry in transcript)

        # Get title and channel name via YouTube's oEmbed endpoint
        response = requests.get(
            f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json",
            timeout=10,
        )
        if response.status_code == 200:
            video_info = response.json()
            title = video_info.get('title', 'Unknown Title')
            author = video_info.get('author_name', 'Unknown Author')
        else:
            title = "Unknown Title"
            author = "Unknown Author"

        content = f"""
Video Title: {title}
Channel: {author}
Transcript: {transcript_text}
"""
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error getting YouTube content: {str(e)}")
        return None


def get_website_content(url):
    """Get content from a website using requests and BeautifulSoup."""
    try:
        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            )
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
            element.decompose()

        title = soup.title.string if soup.title else "No title found"

        # Get the main content, preferring common article containers
        main_content = ""
        selectors = [
            'article',
            'main',
            '[role="main"]',
            '.post-content',
            '.article-content',
            '.content',
            '#content',
        ]
        for selector in selectors:
            element = soup.select_one(selector)
            if element:
                main_content = element.get_text()
                break
        if not main_content:
            # Fall back to concatenating all paragraphs
            paragraphs = soup.find_all('p')
            main_content = '\n'.join(p.get_text() for p in paragraphs)

        # Clean up the text: strip any leftover markup, then collapse whitespace
        main_content = re.sub(r'<[^>]+>', '', main_content)
        main_content = re.sub(r'\s+', ' ', main_content).strip()

        content = f"""
Title: {title}
URL: {url}
Content: {main_content}
"""
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error processing website content: {str(e)}")
        return None


if st.button("Summarize"):
    if not generic_url.strip():
        st.error("Please provide a URL.")
    elif not validators.url(generic_url):
        st.error("Please enter a valid URL.")
    else:
        try:
            with st.spinner("Processing content..."):
                if "youtube.com" in generic_url or "youtu.be" in generic_url:
                    docs = get_youtube_content(generic_url)
                else:
                    docs = get_website_content(generic_url)

            # The fetchers report their own errors and return None on failure,
            # so only proceed when content was actually retrieved.
            if docs:
                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                output_summary = chain.run(docs)

                st.success("Summary Generated!")
                tab1, tab2 = st.tabs(["Summary", "Original Content"])
                with tab1:
                    st.markdown(output_summary)
                with tab2:
                    st.text_area("Raw Content", docs[0].page_content, height=300)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")