Spaces:

jarif
/

Simplify-YouTube-Videos-Web-Articles

Sleeping

File size: 6,386 Bytes

2fc5eb1
 
 
 
 
 
 
 
8f7a009
 
2fc5eb1
 
 
 
 
 
 
 
8f7a009
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fc5eb1
 
 
 
 
 
8f7a009
2fc5eb1
 
8f7a009
2fc5eb1
 
 
 
 
 
 
 
 
 
 
8f7a009
 
 
 
 
 
 
 
 
2fc5eb1
8f7a009
2fc5eb1
8f7a009
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2fc5eb1
8f7a009
 
 
 
 
2fc5eb1
8f7a009
 
2fc5eb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8f7a009
 
 
2fc5eb1
 
 
8f7a009
2fc5eb1
8f7a009
 
 
 
 
 
 
 
2fc5eb1
8f7a009
 
 
 
 
 
 
 
2fc5eb1
8f7a009
2fc5eb1
 
 
 
 
 
 
 
 
 
 
 
 
8f7a009
2fc5eb1
 
8f7a009
2fc5eb1
 
 
8f7a009
2fc5eb1
 
8f7a009
2fc5eb1
 
 
 
 
 
 
 
 
 
 
8f7a009
2fc5eb1
8f7a009
2fc5eb1
 
8f7a009
2fc5eb1
 
 
8f7a009
2fc5eb1
 
8f7a009

import os
import validators
import streamlit as st
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
from youtube_transcript_api import YouTubeTranscriptApi
from urllib.parse import urlparse, parse_qs
import requests
from bs4 import BeautifulSoup
import re

# Load environment variables (python-dotenv)
load_dotenv()

# Text of the introductory blurb rendered beneath the page header
_ABOUT_MARKDOWN = """

### About This App

This application leverages the power of LangChain and Gemma AI to automatically generate concise summaries from YouTube videos and web articles. Whether you're researching a topic, catching up on content, or trying to quickly grasp key information, this tool can help save time by distilling content into clear, readable summaries.

"""

# Streamlit App: browser-tab title and icon
st.set_page_config(page_title="AI Content Summarizer", page_icon="📚")

# Header row: a wide column for the heading text, a narrow one for the logo.
# Elements are added by calling methods on the column containers directly.
col1, col2 = st.columns([0.85, 0.15])
col1.title("AI Content Summarizer")
col1.caption("Powered by LangChain & Gemma 🤖")
col2.image("https://python.langchain.com/img/favicon.ico", width=50)

st.markdown(_ABOUT_MARKDOWN)

# Get API Key & URL input
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    st.error("GROQ API Key not found. Please check your environment variables.")

generic_url = st.text_input("Enter YouTube or Website URL", placeholder="https://example.com or https://youtube.com/watch?v=...")

# LangChain Model with Groq API.
# Only build the client when a key is available: constructing ChatGroq without
# one raises and would abort the script before the rest of the page renders.
# The "Summarize" handler refuses to run when groq_api_key is missing, so llm
# is never used while it is None.
llm = ChatGroq(model="gemma-7b-it", groq_api_key=groq_api_key) if groq_api_key else None

# Prompt Template: {text} is replaced with the fetched page/transcript content
prompt_template = """

Provide a clear and concise summary in 300 words of the following content:



{text}



Focus on the main points and key insights. Write in a professional tone.

"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

def get_youtube_id(url):
    """Extract the video ID from a YouTube URL.

    Recognised forms:
      * ``youtube.com/watch?v=<id>`` on any ``youtube.com`` (sub)domain
      * ``youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``
      * ``youtu.be/<id>``

    Args:
        url: The URL to inspect. A missing scheme is tolerated.

    Returns:
        The video ID string, or ``None`` when the URL is not a recognised
        YouTube video URL.
    """
    # Without a scheme urlparse puts the host into the path, so add one.
    candidate = url if "://" in url else f"https://{url}"
    try:
        parsed = urlparse(candidate)
        host = parsed.hostname or ""
    except ValueError:
        # urlparse rejects some malformed netlocs (e.g. broken IPv6 brackets)
        return None

    segments = [part for part in parsed.path.split("/") if part]

    if host == "youtu.be":
        return segments[0] if segments else None

    # Match on the real hostname so that an unrelated URL that merely mentions
    # "youtube.com" in its path or query is not mistaken for a video link.
    if host == "youtube.com" or host.endswith(".youtube.com"):
        video_id = parse_qs(parsed.query).get("v", [None])[0]
        if video_id:
            return video_id
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]

    return None

def get_youtube_content(url):
    """Get content from YouTube video using youtube-transcript-api.

    Fetches the transcript for the video referenced by ``url`` and, on a
    best-effort basis, its title and channel name from YouTube's oEmbed
    endpoint.

    Args:
        url: A YouTube video URL understood by ``get_youtube_id``.

    Returns:
        A single-element list holding a ``Document`` whose ``page_content``
        contains the title, channel and transcript text, or ``None`` if the
        video ID or transcript could not be obtained (the error is shown via
        ``st.error``).
    """
    try:
        video_id = get_youtube_id(url)
        if not video_id:
            raise ValueError("Could not extract YouTube video ID")

        # Get transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = ' '.join(entry['text'] for entry in transcript)

        # Title/channel are optional extras: a failed or slow metadata lookup
        # must not throw away a transcript that was fetched successfully.
        title = "Unknown Title"
        author = "Unknown Author"
        try:
            response = requests.get(
                "https://www.youtube.com/oembed",
                # Let requests URL-encode the nested video URL
                params={
                    "url": f"https://www.youtube.com/watch?v={video_id}",
                    "format": "json",
                },
                timeout=10,
            )
            if response.status_code == 200:
                video_info = response.json()
                title = video_info.get('title', 'Unknown Title')
                author = video_info.get('author_name', 'Unknown Author')
        except (requests.RequestException, ValueError):
            # Network failure or a non-JSON body: keep the placeholder values
            pass

        content = f"""

Video Title: {title}

Channel: {author}

Transcript:

{transcript_text}

"""
        return [Document(page_content=content)]

    except Exception as e:
        st.error(f"Error getting YouTube content: {str(e)}")
        return None

def get_website_content(url):
    """Get content from website using requests and BeautifulSoup.

    Downloads the page, removes script/style/header/footer/nav elements, and
    extracts text from the first matching "main content" container, falling
    back to all ``<p>`` elements when none of the selectors match.

    Args:
        url: The page URL to fetch.

    Returns:
        A single-element list holding a ``Document`` whose ``page_content``
        contains the page title, the URL and the cleaned text, or ``None`` if
        anything fails (the error is shown via ``st.error``).
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # TLS certificate verification is left at its default (enabled): the
        # URL is user-supplied, so the response must not be silently accepted
        # from an unverified server. The timeout keeps a stalled server from
        # hanging the app indefinitely.
        # NOTE(review): this fetches arbitrary user-supplied URLs server-side;
        # consider restricting internal/private addresses if deployed publicly.
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
            element.decompose()

        # .string is None for an empty <title> or one with nested markup, so
        # use get_text() and fall back to the placeholder when nothing is left.
        title_tag = soup.title
        title = (title_tag.get_text(strip=True) if title_tag else "") or "No title found"

        # Get main content: first selector that matches wins
        main_content = ""
        selectors = [
            'article', 'main',
            '[role="main"]',
            '.post-content',
            '.article-content',
            '.content',
            '#content'
        ]

        for selector in selectors:
            node = soup.select_one(selector)
            if node is not None:
                main_content = node.get_text()
                break

        if not main_content:
            paragraphs = soup.find_all('p')
            main_content = '\n'.join(p.get_text() for p in paragraphs)

        # Clean up text: collapse whitespace, then drop any tag-like remnants
        main_content = re.sub(r'\s+', ' ', main_content).strip()
        main_content = re.sub(r'<[^>]+>', '', main_content)

        content = f"""

Title: {title}

URL: {url}



Content:

{main_content}

"""
        return [Document(page_content=content)]

    except Exception as e:
        st.error(f"Error processing website content: {str(e)}")
        return None

# Main action: fetch the content behind the URL and summarize it with the LLM
if st.button("Summarize"):
    # Strip once so stray whitespace from copy/paste does not fail validation
    target_url = generic_url.strip()

    if not groq_api_key or not target_url:
        st.error("Please provide a valid API key and URL.")
    elif not validators.url(target_url):
        st.error("Please enter a valid URL.")
    else:
        try:
            with st.spinner("Processing content..."):
                # Route on the parsed hostname rather than a substring of the
                # whole URL, so an article whose path or query merely mentions
                # "youtube.com" is still treated as a website.
                host = (urlparse(target_url).hostname or "").lower()
                is_youtube = (
                    host == "youtu.be"
                    or host == "youtube.com"
                    or host.endswith(".youtube.com")
                )

                if is_youtube:
                    docs = get_youtube_content(target_url)
                else:
                    docs = get_website_content(target_url)

                # The fetch helpers have already reported the error to the user
                if docs is None:
                    st.stop()

                # "stuff" chain: the whole document is placed into one prompt
                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                output_summary = chain.run(docs)

                st.success("Summary Generated!")

                tab1, tab2 = st.tabs(["Summary", "Original Content"])

                with tab1:
                    st.markdown(output_summary)

                with tab2:
                    if docs:
                        st.text_area("Raw Content", docs[0].page_content, height=300)

        except Exception as e:
            st.error(f"An error occurred: {str(e)}")