File size: 6,386 Bytes
2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 2fc5eb1 8f7a009 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
import os
import validators
import streamlit as st
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
from youtube_transcript_api import YouTubeTranscriptApi
from urllib.parse import urlparse, parse_qs
import requests
from bs4 import BeautifulSoup
import re
# Load environment variables (e.g. GROQ_API_KEY) from a local .env file if present
load_dotenv()

# Streamlit App
st.set_page_config(page_title="AI Content Summarizer", page_icon="📚")

# Create two columns for the title: a wide one for the heading, a narrow one for the logo
col1, col2 = st.columns([0.85, 0.15])
with col1:
    st.title("AI Content Summarizer")
    st.caption("Powered by LangChain & Gemma 🤖")
with col2:
    st.image("https://python.langchain.com/img/favicon.ico", width=50)

st.markdown("""
### About This App
This application leverages the power of LangChain and Gemma AI to automatically generate concise summaries from YouTube videos and web articles. Whether you're researching a topic, catching up on content, or trying to quickly grasp key information, this tool can help save time by distilling content into clear, readable summaries.
""")

# Get API Key & URL input
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    st.error("GROQ API Key not found. Please check your environment variables.")
    # The ChatGroq client below needs a key; end this script run here instead of
    # continuing into a client-construction failure.
    st.stop()

generic_url = st.text_input("Enter YouTube or Website URL", placeholder="https://example.com or https://youtube.com/watch?v=...")

# LangChain Model with Groq API
# The model id can be overridden with the GROQ_MODEL environment variable; the
# default is unchanged.
# NOTE(review): confirm "gemma-7b-it" is still served by Groq before deploying.
llm = ChatGroq(model=os.getenv("GROQ_MODEL", "gemma-7b-it"), groq_api_key=groq_api_key)

# Prompt Template: {text} is filled with the fetched document content
prompt_template = """
Provide a clear and concise summary in 300 words of the following content:
{text}
Focus on the main points and key insights. Write in a professional tone.
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
def get_youtube_id(url):
    """Extract the video ID from a YouTube URL.

    Supported shapes:
      * ``https://www.youtube.com/watch?v=<id>`` (any ``*.youtube.com`` host)
      * ``https://youtu.be/<id>``
      * ``https://www.youtube.com/{embed,shorts,live,v}/<id>``
      * the same forms without a scheme (``youtube.com/watch?v=<id>``)

    Args:
        url: The URL string to inspect.

    Returns:
        The video ID string, or ``None`` when the URL is empty, is not hosted
        on a YouTube domain, or carries no video ID.
    """
    if not url:
        return None

    candidate = url.strip()
    # urlparse only fills in the host when a scheme is present.
    if '://' not in candidate:
        candidate = 'https://' + candidate

    parsed = urlparse(candidate)
    host = (parsed.hostname or '').lower()
    if host.startswith('www.'):
        host = host[len('www.'):]

    # Non-empty path components, so trailing slashes never leak into the ID.
    segments = [part for part in parsed.path.split('/') if part]

    if host == 'youtu.be':
        return segments[0] if segments else None

    # Match on the parsed host, not on a substring of the whole URL, so that
    # e.g. "https://example.com/?ref=youtube.com" is not treated as YouTube.
    is_youtube_host = (
        host in ('youtube.com', 'youtube-nocookie.com')
        or host.endswith('.youtube.com')
    )
    if is_youtube_host:
        video_id = parse_qs(parsed.query).get('v', [None])[0]
        if video_id:
            return video_id
        if len(segments) >= 2 and segments[0] in ('embed', 'shorts', 'live', 'v'):
            return segments[1]

    return None
def _fetch_youtube_metadata(video_id):
    """Return ``(title, author)`` for a video using YouTube's oEmbed endpoint.

    This is best effort: on any network error, non-200 response, or
    unparsable body the placeholders ``"Unknown Title"`` and
    ``"Unknown Author"`` are returned instead of raising.
    """
    title, author = "Unknown Title", "Unknown Author"
    try:
        response = requests.get(
            "https://www.youtube.com/oembed",
            # Let requests URL-encode the nested watch URL.
            params={
                "url": f"https://www.youtube.com/watch?v={video_id}",
                "format": "json",
            },
            # Bound the wait so a stalled request cannot hang the app.
            timeout=10,
        )
        if response.status_code == 200:
            video_info = response.json()
            title = video_info.get('title', 'Unknown Title')
            author = video_info.get('author_name', 'Unknown Author')
    except (requests.RequestException, ValueError):
        # Metadata is decorative; keep the placeholders and carry on.
        pass
    return title, author


def get_youtube_content(url):
    """Get content from a YouTube video using youtube-transcript-api.

    Args:
        url: A YouTube video URL.

    Returns:
        A one-element list holding a ``Document`` whose text contains the
        video title, channel name and full transcript, or ``None`` if the
        video ID or the transcript could not be obtained (the error is shown
        to the user via ``st.error``).
    """
    try:
        video_id = get_youtube_id(url)
        if not video_id:
            raise ValueError("Could not extract YouTube video ID")

        # Get transcript
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        transcript_text = ' '.join(entry['text'] for entry in transcript)

        # A metadata failure must not discard a transcript we already have.
        title, author = _fetch_youtube_metadata(video_id)

        content = (
            f"\nVideo Title: {title}\n"
            f"Channel: {author}\n"
            f"Transcript:\n"
            f"{transcript_text}\n"
        )
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error getting YouTube content: {str(e)}")
        return None
def get_website_content(url):
    """Get content from a website using requests and BeautifulSoup.

    The page is downloaded, boilerplate elements (scripts, styles, header,
    footer, nav) are removed, and the text of the first non-empty "main
    content" container is extracted. If no such container is found, the text
    of all ``<p>`` elements is used instead.

    Args:
        url: The page URL to fetch.

    Returns:
        A one-element list holding a ``Document`` with the page title, URL
        and cleaned text, or ``None`` if fetching or parsing failed (the
        error is shown to the user via ``st.error``).
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # TLS certificate verification stays enabled (requests' default), and
        # a timeout bounds how long an unresponsive server can stall the app.
        # NOTE(review): the URL is user-supplied and fetched server-side;
        # consider blocking internal/private hosts if this is deployed publicly.
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unwanted elements
        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
            element.decompose()

        # .string is None when <title> has nested markup; get_text() is not.
        title = soup.title.get_text(strip=True) if soup.title else ""
        if not title:
            title = "No title found"

        # Candidate containers, most specific first
        selectors = [
            'article', 'main',
            '[role="main"]',
            '.post-content',
            '.article-content',
            '.content',
            '#content'
        ]
        main_content = ""
        for selector in selectors:
            node = soup.select_one(selector)
            if node is None:
                continue
            text = node.get_text()
            # Skip empty containers so later selectors or the paragraph
            # fallback still get a chance.
            if text.strip():
                main_content = text
                break

        if not main_content:
            paragraphs = soup.find_all('p')
            main_content = '\n'.join(p.get_text() for p in paragraphs)

        # Clean up text: collapse whitespace, then drop tag-like fragments
        main_content = re.sub(r'\s+', ' ', main_content).strip()
        main_content = re.sub(r'<[^>]+>', '', main_content)

        content = (
            f"\nTitle: {title}\n"
            f"URL: {url}\n"
            f"Content:\n"
            f"{main_content}\n"
        )
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error processing website content: {str(e)}")
        return None
# Run the summarization pipeline when the user clicks the button
if st.button("Summarize"):
    # Strip once and use the stripped value everywhere, so a URL pasted with
    # surrounding whitespace is validated and fetched correctly.
    target_url = generic_url.strip()

    if not groq_api_key or not target_url:
        st.error("Please provide a valid API key and URL.")
    elif not validators.url(target_url):
        st.error("Please enter a valid URL.")
    else:
        try:
            with st.spinner("Processing content..."):
                # Route to the matching loader based on the URL
                if "youtube.com" in target_url or "youtu.be" in target_url:
                    docs = get_youtube_content(target_url)
                else:
                    docs = get_website_content(target_url)

                # Loaders return None after reporting their own error
                if docs is None:
                    st.stop()

                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                output_summary = chain.run(docs)

            st.success("Summary Generated!")
            tab1, tab2 = st.tabs(["Summary", "Original Content"])
            with tab1:
                st.markdown(output_summary)
            with tab2:
                if docs:
                    st.text_area("Raw Content", docs[0].page_content, height=300)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")