import os
import validators
import streamlit as st
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
import yt_dlp
import requests
from bs4 import BeautifulSoup
import re
import json
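# Third-party dependencies used in this script: streamlit, validators,
# python-dotenv, langchain, langchain-groq, yt-dlp, requests, beautifulsoup4,
# and (lazily, inside get_youtube_content) youtube-transcript-api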
# Convert a browser-exported JSON cookie dump (youtube.json) into the
# Netscape cookies.txt format that yt-dlp can consume
try:
    with open('youtube.json', 'r') as f:
        cookies = json.load(f)
    cookie_content = """# Netscape HTTP Cookie File
# https://curl.haxx.se/docs/http-cookies.html
# This file is generated by yt-dlp! Edit at your own risk.
"""
    for cookie in cookies:
        domain = cookie.get('domain', '')
        if not domain.startswith('.'):  # Netscape format expects a leading dot
            domain = '.' + domain
        path = cookie.get('path', '/')
        secure = "TRUE" if cookie.get('secure', False) else "FALSE"
        expires = str(int(cookie.get('expirationDate', 2147483647)))
        name = cookie.get('name', '')
        value = cookie.get('value', '')
        if domain and name and value:
            cookie_content += f"{domain}\tTRUE\t{path}\t{secure}\t{expires}\t{name}\t{value}\n"
    with open('youtube_cookies.txt', 'w', encoding='utf-8') as f:
        f.write(cookie_content)
except Exception as e:
    print(f"Error processing cookies: {e}")
# Load environment variables
load_dotenv()
# Streamlit App
st.set_page_config(page_title="LangChain: Summarize Text From YT or Website", page_icon="🦜")
st.title("🦜 LangChain: Summarize Text From YT or Website")
st.subheader("Summarize URL")
# Get API key & URL input
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    st.error("GROQ API Key not found. Please check your environment variables.")
    st.stop()  # the ChatGroq client below cannot be constructed without a key
generic_url = st.text_input("Enter YouTube or Website URL", label_visibility="collapsed")
# LangChain Model with Groq API
llm = ChatGroq(model="gemma2-9b-it", groq_api_key=groq_api_key)
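# Note: any chat model ID available on Groq could be substituted for
# "gemma2-9b-it"; the summarize chain below is model-agnostic.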
# Prompt Template
prompt_template = """
Provide a clear and concise summary, in about 300 words, of the following content:
{text}
Focus on the main points and key insights. Write in a professional tone.
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
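# Sanity-check sketch: PromptTemplate.format fills the {text} slot, e.g.
#   prompt.format(text="example content")
# returns the instruction string that the "stuff" chain sends to the model.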
def get_youtube_content(url):
    """Fetch a YouTube video's transcript (preferred) or description as a fallback."""
    try:
        # Lazy imports: only needed for YouTube URLs
        from youtube_transcript_api import YouTubeTranscriptApi
        from urllib.parse import urlparse, parse_qs

        # Extract the video ID from either URL form
        if 'youtube.com' in url:
            video_id = parse_qs(urlparse(url).query)['v'][0]
        elif 'youtu.be' in url:
            video_id = urlparse(url).path.lstrip('/')
        else:
            raise ValueError("Not a valid YouTube URL")

        try:
            # Preferred source: the video's transcript
            transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
            transcript_text = ' '.join(entry['text'] for entry in transcript_list)
        except Exception:
            # Fall back to the video description via yt-dlp if no transcript exists
            ydl_opts = {
                'quiet': True,
                'no_warnings': True,
                'extract_flat': True,
            }
            # Reuse the Netscape cookie file generated above, if present
            if os.path.exists('youtube_cookies.txt'):
                ydl_opts['cookiefile'] = 'youtube_cookies.txt'
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                try:
                    video_info = ydl.extract_info(url, download=False)
                    transcript_text = video_info.get('description', 'No description available')
                except Exception:
                    transcript_text = "Could not extract video content."

        # Fetch title and uploader via the public oEmbed endpoint
        response = requests.get(
            f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json",
            timeout=10,
        )
        if response.status_code == 200:
            video_info = response.json()
            title = video_info.get('title', '')
            uploader = video_info.get('author_name', '')
        else:
            title = "Unknown Title"
            uploader = "Unknown Uploader"

        content = f"""
Video Title: {title}
Uploader: {uploader}
Content:
{transcript_text}
"""
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error getting YouTube content: {str(e)}")
        return None
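# Usage sketch outside the Streamlit flow (hypothetical video URL):
#   docs = get_youtube_content("https://www.youtube.com/watch?v=VIDEO_ID")
#   if docs:
#       print(docs[0].page_content[:300])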
def get_website_content(url):
    """Fetch and clean the main text of a web page using requests and BeautifulSoup."""
    try:
        # Mimic a browser so simple bot filters don't reject the request
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # verify=False tolerates sites with broken TLS chains, at the cost of security
        response = requests.get(url, headers=headers, verify=False, timeout=15)
        response.raise_for_status()

        # Parse HTML and drop non-content elements
        soup = BeautifulSoup(response.text, 'html.parser')
        for script in soup(["script", "style"]):
            script.decompose()

        # Get title
        title = soup.title.string if soup.title else "No title found"

        # Prefer an <article> tag, then common content containers, then all paragraphs
        main_content = ""
        article = soup.find('article')
        if article:
            main_content = article.get_text()
        else:
            content_tags = ['main', 'div.content', 'div.post-content', 'div.article-content']
            for tag in content_tags:
                element = soup.select_one(tag)
                if element:
                    main_content = element.get_text()
                    break
        if not main_content:
            paragraphs = soup.find_all('p')
            main_content = '\n'.join(p.get_text() for p in paragraphs)

        # Collapse whitespace and strip any leftover tags
        main_content = re.sub(r'\s+', ' ', main_content).strip()
        main_content = re.sub(r'<[^>]+>', '', main_content)

        content = f"""
Title: {title}
URL: {url}
Content:
{main_content}
"""
        return [Document(page_content=content)]
    except Exception as e:
        st.error(f"Error fetching or processing {url}, exception:\n{str(e)}")
        return None
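# Usage sketch (hypothetical URL):
#   docs = get_website_content("https://example.com/some-article")
#   if docs:
#       print(docs[0].page_content[:300])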
if st.button("Summarize the Content from YT or Website"):
    # Validate input
    if not groq_api_key or not generic_url.strip():
        st.error("Please provide a valid API key and URL.")
    elif not validators.url(generic_url):
        st.error("Please enter a valid URL (YouTube or a website).")
    else:
        try:
            with st.spinner("Fetching content and summarizing..."):
                # Load data from YouTube or a website
                if "youtube.com" in generic_url or "youtu.be" in generic_url:
                    docs = get_youtube_content(generic_url)
                else:
                    docs = get_website_content(generic_url)
                if docs is None:
                    st.stop()

                # Create the summary chain and run it
                chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                output_summary = chain.run(docs)

                # Display the results
                st.success("Summary Generated Successfully!")
                tab1, tab2 = st.tabs(["Summary", "Raw Content"])
                with tab1:
                    st.write(output_summary)
                with tab2:
                    if docs:
                        st.text_area("Original Content", docs[0].page_content, height=300)
        except Exception as e:
            st.error(f"An error occurred: {str(e)}")
            st.exception(e)