import os
import validators
import streamlit as st
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
import yt_dlp
import requests
from bs4 import BeautifulSoup
import re
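
# Third-party dependencies implied by the imports above; a reasonable install line is:
#   pip install streamlit validators python-dotenv langchain langchain-groq yt-dlp requests beautifulsoup4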

# Load environment variables
load_dotenv()

# Streamlit App
st.set_page_config(page_title="LangChain: Summarize Text From YT or Website", page_icon="🦜")
st.title("🦜 LangChain: Summarize Text From YT or Website")
st.subheader("Summarize URL")

# Get API Key & URL input
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    st.error("GROQ API Key not found. Please check your environment variables.")

generic_url = st.text_input("Enter YouTube or Website URL", label_visibility="collapsed")

# LangChain model with the Groq API; skip initialization when the key is missing
# so the error message above is shown instead of an unhandled exception at import time.
llm = ChatGroq(model="gemma2-9b-it", groq_api_key=groq_api_key) if groq_api_key else None

# Prompt Template
prompt_template = """
Provide a clear and concise summary in 300 words of the following content:
{text}
Focus on the main points and key insights. Write in a professional tone.
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])

def get_youtube_content(url):
    """Get content from a YouTube video using yt-dlp (metadata only, no download)."""
    try:
        ydl_opts = {
            'format': 'worst',
            'extract_flat': True,
            'quiet': True,
            'no_warnings': True,
            'extractor_args': {
                'youtube': {
                    'skip': ['dash', 'hls'],
                }
            },
            # Reuse cookies from a local Chrome profile; this requires Chrome on the
            # same machine and will fail on headless/server deployments without it.
            'cookiesfrombrowser': ('chrome', ),
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

        title = info.get('title', '')
        description = info.get('description', '')
        views = info.get('view_count', 'Unknown')
        uploader = info.get('uploader', 'Unknown')
        upload_date = info.get('upload_date', 'Unknown')

        content = f"""
Video Title: {title}
Uploader: {uploader}
Upload Date: {upload_date}
Views: {views}
Description:
{description}
"""
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error getting YouTube content: {str(e)}")
        return None

def get_website_content(url):
    """Get content from a website using requests and BeautifulSoup."""
    try:
        # Send the request with headers that mimic a browser
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Note: verify=False skips SSL certificate checks; the timeout keeps the app
        # from hanging indefinitely on unresponsive hosts.
        response = requests.get(url, headers=headers, verify=False, timeout=15)
        response.raise_for_status()

        # Parse the HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove script and style elements
        for script in soup(["script", "style"]):
            script.decompose()

        # Get the page title
        title = soup.title.string if soup.title else "No title found"

        # Get the main content (adjust selectors based on the website structure)
        main_content = ""

        # Try to find an <article> element first
        article = soup.find('article')
        if article:
            main_content = article.get_text()
        else:
            # If there is no <article> tag, try common content containers
            content_tags = ['main', 'div.content', 'div.post-content', 'div.article-content']
            for tag in content_tags:
                element = soup.select_one(tag)
                if element:
                    main_content = element.get_text()
                    break

        # If there is still no content, fall back to all paragraph text
        if not main_content:
            paragraphs = soup.find_all('p')
            main_content = '\n'.join(p.get_text() for p in paragraphs)

        # Clean up the text: collapse extra whitespace and newlines
        main_content = re.sub(r'\s+', ' ', main_content).strip()
        # Strip any stray angle-bracket markup that survived get_text()
        main_content = re.sub(r'<[^>]+>', '', main_content)

        content = f"""
Title: {title}
URL: {url}
Content:
{main_content}
"""
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error fetching or processing {url}, exception:\n{str(e)}")
        return None

if st.button("Summarize the Content from YT or Website"):
    # Validate Input
    if not groq_api_key or not generic_url.strip():
        st.error("Please provide a valid API key and URL.")
    elif not validators.url(generic_url):
        st.error("Please enter a valid URL (YouTube or a website).")
    else:
        try:
            with st.spinner("Fetching content and summarizing..."):
                # Load data from YouTube or Website
                if "youtube.com" in generic_url or "youtu.be" in generic_url:
                    docs = get_youtube_content(generic_url)
                else:
                    docs = get_website_content(generic_url)

                if docs is None:
                    st.stop()

                # Create the summary chain and run it
                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                output_summary = chain.run(docs)

            # Display the results
            st.success("Summary Generated Successfully!")
            tab1, tab2 = st.tabs(["Summary", "Raw Content"])
            with tab1:
                st.write(output_summary)
            with tab2:
                if docs:
                    st.text_area("Original Content", docs[0].page_content, height=300)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
            st.exception(e)
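
# Run locally with: streamlit run app.py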