import os
import re

import requests
import streamlit as st
import validators
import yt_dlp
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.schema import Document
from langchain_groq import ChatGroq

# Load environment variables
load_dotenv()

# Streamlit app
st.set_page_config(page_title="LangChain: Summarize Text From YT or Website", page_icon="🦜")
st.title("🦜 LangChain: Summarize Text From YT or Website")
st.subheader("Summarize URL")

# Get API key & URL input
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    st.error("GROQ API Key not found. Please check your environment variables.")
    st.stop()  # ChatGroq below cannot be constructed without a key

generic_url = st.text_input("Enter YouTube or Website URL", label_visibility="collapsed")

# LangChain model with Groq API
llm = ChatGroq(model="gemma2-9b-it", groq_api_key=groq_api_key)

# Prompt template
prompt_template = """
Provide a clear and concise summary in 300 words of the following content:

{text}

Focus on the main points and key insights. Write in a professional tone.
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])


def get_youtube_content(url):
    """Get title, metadata, and description from a YouTube video using yt-dlp."""
    try:
        ydl_opts = {
            'format': 'worst',
            'extract_flat': True,
            'quiet': True,
            'no_warnings': True,
            'extractor_args': {
                'youtube': {
                    'skip': ['dash', 'hls'],
                }
            },
            'cookiesfrombrowser': ('chrome',),  # Get cookies from Chrome
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

        title = info.get('title', '')
        description = info.get('description', '')
        views = info.get('view_count', 'Unknown')
        uploader = info.get('uploader', 'Unknown')
        upload_date = info.get('upload_date', 'Unknown')

        content = f"""
Video Title: {title}
Uploader: {uploader}
Upload Date: {upload_date}
Views: {views}

Description:
{description}
"""
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error getting YouTube content: {str(e)}")
        return None


def get_website_content(url):
    """Get the main text content from a website using requests and BeautifulSoup."""
    try:
        # Send request with headers that mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, verify=False)  # verify=False tolerates broken SSL certs
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        # Get title
        title = soup.title.string if soup.title else "No title found"

        # Get main content (adjust selectors based on the website structure)
        main_content = ""

        # Try to find article content first
        article = soup.find('article')
        if article:
            main_content = article.get_text()
        else:
            # If there is no <article> tag, try common content containers
            content_tags = ['main', 'div.content', 'div.post-content', 'div.article-content']
            for tag in content_tags:
                element = soup.select_one(tag)
                if element:
                    main_content = element.get_text()
                    break

        # If there is still no content, fall back to all paragraph text
        if not main_content:
            paragraphs = soup.find_all('p')
            main_content = '\n'.join(p.get_text() for p in paragraphs)

        # Clean up the text: collapse whitespace and strip any leftover HTML tags
        main_content = re.sub(r'\s+', ' ', main_content).strip()
        main_content = re.sub(r'<[^>]+>', '', main_content)

        content = f"""
Title: {title}
URL: {url}

Content:
{main_content}
"""
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error fetching or processing {url}, exception:\n{str(e)}")
        return None


if st.button("Summarize the Content from YT or Website"):
    # Validate input
    if not groq_api_key or not generic_url.strip():
        st.error("Please provide a valid API key and URL.")
    elif not validators.url(generic_url):
        st.error("Please enter a valid URL (YouTube or a website).")
    else:
        try:
            with st.spinner("Fetching content and summarizing..."):
                # Load data from YouTube or a website
                if "youtube.com" in generic_url or "youtu.be" in generic_url:
                    docs = get_youtube_content(generic_url)
                else:
                    docs = get_website_content(generic_url)

                if docs is None:
                    st.stop()

                # Create the summary chain and run it
                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                output_summary = chain.run(docs)

                # Display the results
                st.success("Summary Generated Successfully!")
                tab1, tab2 = st.tabs(["Summary", "Raw Content"])
                with tab1:
                    st.write(output_summary)
                with tab2:
                    if docs:
                        st.text_area("Original Content", docs[0].page_content, height=300)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
            st.exception(e)
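# ---------------------------------------------------------------------------
# Usage sketch (assumptions: this script is saved as app.py, and a .env file
# containing GROQ_API_KEY=<your-key> sits in the same directory; the package
# names below simply mirror the imports at the top of this file):
#
#   pip install streamlit validators python-dotenv langchain langchain-groq \
#       yt-dlp requests beautifulsoup4
#   streamlit run app.py
#
# Streamlit serves the app on http://localhost:8501 by default; paste a
# YouTube or article URL and click "Summarize the Content from YT or Website".
# ---------------------------------------------------------------------------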