import os
import validators
import streamlit as st
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
import yt_dlp
import requests
from bs4 import BeautifulSoup
import re
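
# Third-party dependencies implied by the imports above; a reasonable install line is:
#   pip install streamlit validators python-dotenv langchain langchain-groq yt-dlp requests beautifulsoup4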

# Load environment variables
load_dotenv()

# Streamlit App
st.set_page_config(page_title="LangChain: Summarize Text From YT or Website", page_icon="🦜")
st.title("🦜 LangChain: Summarize Text From YT or Website")
st.subheader("Summarize URL")

# Get API Key & URL input
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    st.error("GROQ API Key not found. Please check your environment variables.")

generic_url = st.text_input("Enter YouTube or Website URL", label_visibility="collapsed")

# LangChain model with the Groq API; skip initialization when the key is missing
# so the error message above is shown instead of an unhandled exception at import time.
llm = ChatGroq(model="gemma2-9b-it", groq_api_key=groq_api_key) if groq_api_key else None

# Prompt Template
prompt_template = """
Provide a clear and concise summary in 300 words of the following content:
{text}
Focus on the main points and key insights. Write in a professional tone.
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

def get_youtube_content(url):
    """Get content from a YouTube video using yt-dlp (metadata only, no download)."""
    try:
        ydl_opts = {
            'format': 'worst',
            'extract_flat': True,
            'quiet': True,
            'no_warnings': True,
            'extractor_args': {
                'youtube': {
                    'skip': ['dash', 'hls'],
                }
            },
            # Reuse cookies from a local Chrome profile; this requires Chrome on the
            # same machine and will fail on headless/server deployments without it.
            'cookiesfrombrowser': ('chrome', ),
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

        title = info.get('title', '')
        description = info.get('description', '')
        views = info.get('view_count', 'Unknown')
        uploader = info.get('uploader', 'Unknown')
        upload_date = info.get('upload_date', 'Unknown')

        content = f"""
Video Title: {title}
Uploader: {uploader}
Upload Date: {upload_date}
Views: {views}
Description:
{description}
"""
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error getting YouTube content: {str(e)}")
        return None

def get_website_content(url):
    """Get content from a website using requests and BeautifulSoup."""
    try:
        # Send the request with headers that mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Note: verify=False skips SSL certificate checks; the timeout keeps the app
        # from hanging indefinitely on unresponsive hosts.
        response = requests.get(url, headers=headers, verify=False, timeout=15)
        response.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        # Get the page title
        title = soup.title.string if soup.title else "No title found"

        # Get the main content (adjust selectors based on the website structure)
        main_content = ""

        # Try to find an <article> element first
        article = soup.find('article')
        if article:
            main_content = article.get_text()
        else:
            # If there is no <article> tag, try common content containers
            content_tags = ['main', 'div.content', 'div.post-content', 'div.article-content']
            for tag in content_tags:
                element = soup.select_one(tag)
                if element:
                    main_content = element.get_text()
                    break

        # If there is still no content, fall back to all paragraph text
        if not main_content:
            paragraphs = soup.find_all('p')
            main_content = '\n'.join(p.get_text() for p in paragraphs)

        # Clean up the text: collapse extra whitespace and newlines
        main_content = re.sub(r'\s+', ' ', main_content).strip()
        # Strip any stray angle-bracket markup that survived get_text()
        main_content = re.sub(r'<[^>]+>', '', main_content)

        content = f"""
Title: {title}
URL: {url}
Content:
{main_content}
"""
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error fetching or processing {url}, exception:\n{str(e)}")
        return None

if st.button("Summarize the Content from YT or Website"):
    # Validate Input
    if not groq_api_key or not generic_url.strip():
        st.error("Please provide a valid API key and URL.")
    elif not validators.url(generic_url):
        st.error("Please enter a valid URL (YouTube or a website).")
    else:
        try:
            with st.spinner("Fetching content and summarizing..."):
                # Load data from YouTube or Website
                if "youtube.com" in generic_url or "youtu.be" in generic_url:
                    docs = get_youtube_content(generic_url)
                else:
                    docs = get_website_content(generic_url)

                if docs is None:
                    st.stop()

                # Create the summary chain and run it
                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                output_summary = chain.run(docs)

            # Display the results
            st.success("Summary Generated Successfully!")
            tab1, tab2 = st.tabs(["Summary", "Raw Content"])
            with tab1:
                st.write(output_summary)
            with tab2:
                if docs:
                    st.text_area("Original Content", docs[0].page_content, height=300)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
            st.exception(e)
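
# Run locally with: streamlit run app.py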