jarif committed
Commit f8b6c23 · verified · 1 Parent(s): 5f17154

Upload app.py

Files changed (1)
app.py +73 -83
app.py CHANGED
@@ -6,8 +6,7 @@ from langchain.prompts import PromptTemplate
 from langchain_groq import ChatGroq
 from langchain.chains.summarize import load_summarize_chain
 from langchain.schema import Document
-from youtube_transcript_api import YouTubeTranscriptApi
-from urllib.parse import urlparse, parse_qs
+import yt_dlp
 import requests
 from bs4 import BeautifulSoup
 import re
@@ -16,31 +15,19 @@ import re
 load_dotenv()
 
 # Streamlit App
-st.set_page_config(page_title="AI Content Summarizer", page_icon="📚")
-
-# Create two columns for the title
-col1, col2 = st.columns([0.85, 0.15])
-with col1:
-    st.title("AI Content Summarizer")
-    st.caption("Powered by LangChain & Gemma 🤖")
-
-with col2:
-    st.image("https://python.langchain.com/img/favicon.ico", width=50)
-
-st.markdown("""
-### About This App
-This application leverages the power of LangChain and Gemma AI to automatically generate concise summaries from YouTube videos and web articles. Whether you're researching a topic, catching up on content, or trying to quickly grasp key information, this tool can help save time by distilling content into clear, readable summaries.
-""")
+st.set_page_config(page_title="LangChain: Summarize Text From YT or Website", page_icon="🦜")
+st.title("🦜 LangChain: Summarize Text From YT or Website")
+st.subheader("Summarize URL")
 
 # Get API Key & URL input
 groq_api_key = os.getenv("GROQ_API_KEY")
 if not groq_api_key:
     st.error("GROQ API Key not found. Please check your environment variables.")
 
-generic_url = st.text_input("Enter YouTube or Website URL", placeholder="https://example.com or https://youtube.com/watch?v=...")
+generic_url = st.text_input("Enter YouTube or Website URL", label_visibility="collapsed")
 
 # LangChain Model with Groq API
-llm = ChatGroq(model="gemma-7b-it", groq_api_key=groq_api_key)
+llm = ChatGroq(model="gemma2-9b-it", groq_api_key=groq_api_key)
 
 # Prompt Template
 prompt_template = """
@@ -52,44 +39,36 @@ Focus on the main points and key insights. Write in a professional tone.
 """
 prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
 
-def get_youtube_id(url):
-    """Extract video ID from YouTube URL"""
-    if 'youtube.com' in url:
-        query = parse_qs(urlparse(url).query)
-        return query.get('v', [None])[0]
-    elif 'youtu.be' in url:
-        return urlparse(url).path[1:]
-    return None
-
 def get_youtube_content(url):
-    """Get content from YouTube video using youtube-transcript-api"""
+    """Get content from YouTube video using yt-dlp"""
     try:
-        video_id = get_youtube_id(url)
-        if not video_id:
-            raise ValueError("Could not extract YouTube video ID")
-
-        # Get transcript
-        transcript = YouTubeTranscriptApi.get_transcript(video_id)
-        transcript_text = ' '.join([entry['text'] for entry in transcript])
-
-        # Get video info using a simple request
-        response = requests.get(f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json")
-        if response.status_code == 200:
-            video_info = response.json()
-            title = video_info.get('title', 'Unknown Title')
-            author = video_info.get('author_name', 'Unknown Author')
-        else:
-            title = "Unknown Title"
-            author = "Unknown Author"
-
-        content = f"""
+        ydl_opts = {
+            'format': 'worst',
+            'extract_flat': True,
+            'quiet': True,
+            'no_warnings': True
+        }
+
+        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+            info = ydl.extract_info(url, download=False)
+
+        title = info.get('title', '')
+        description = info.get('description', '')
+        views = info.get('view_count', 'Unknown')
+        uploader = info.get('uploader', 'Unknown')
+        upload_date = info.get('upload_date', 'Unknown')
+
+        content = f"""
 Video Title: {title}
-Channel: {author}
-Transcript:
-{transcript_text}
-"""
-        return [Document(page_content=content)]
+Uploader: {uploader}
+Upload Date: {upload_date}
+Views: {views}
 
+Description:
+{description}
+"""
+        return [Document(page_content=content)]
+
     except Exception as e:
         st.error(f"Error getting YouTube content: {str(e)}")
         return None
@@ -97,42 +76,48 @@ Transcript:
 def get_website_content(url):
     """Get content from website using requests and BeautifulSoup"""
     try:
+        # Send request with headers to mimic a browser
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
         response = requests.get(url, headers=headers, verify=False)
         response.raise_for_status()
 
+        # Parse HTML
         soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Remove unwanted elements
-        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
-            element.decompose()
+        # Remove script and style elements
+        for script in soup(["script", "style"]):
+            script.decompose()
 
+        # Get title
         title = soup.title.string if soup.title else "No title found"
 
-        # Get main content with improved selection
+        # Get main content (adjust selectors based on the website structure)
        main_content = ""
-        selectors = [
-            'article', 'main',
-            '[role="main"]',
-            '.post-content',
-            '.article-content',
-            '.content',
-            '#content'
-        ]
 
-        for selector in selectors:
-            if soup.select_one(selector):
-                main_content = soup.select_one(selector).get_text()
-                break
-
-        if not main_content:
-            paragraphs = soup.find_all('p')
-            main_content = '\n'.join(p.get_text() for p in paragraphs)
+        # Try to find article content first
+        article = soup.find('article')
+        if article:
+            main_content = article.get_text()
+        else:
+            # If no article tag, try common content containers
+            content_tags = ['main', 'div.content', 'div.post-content', 'div.article-content']
+            for tag in content_tags:
+                element = soup.select_one(tag)
+                if element:
+                    main_content = element.get_text()
+                    break
+
+        # If still no content, get all paragraph text
+        if not main_content:
+            paragraphs = soup.find_all('p')
+            main_content = '\n'.join(p.get_text() for p in paragraphs)
 
-        # Clean up text
+        # Clean up the text
+        # Remove extra whitespace and newlines
         main_content = re.sub(r'\s+', ' ', main_content).strip()
+        # Remove any remaining HTML tags
         main_content = re.sub(r'<[^>]+>', '', main_content)
 
         content = f"""
@@ -145,17 +130,19 @@ Content:
         return [Document(page_content=content)]
 
     except Exception as e:
-        st.error(f"Error processing website content: {str(e)}")
+        st.error(f"Error fetching or processing {url}, exception:\n{str(e)}")
        return None
 
-if st.button("Summarize"):
+if st.button("Summarize the Content from YT or Website"):
+    # Validate Input
     if not groq_api_key or not generic_url.strip():
         st.error("Please provide a valid API key and URL.")
     elif not validators.url(generic_url):
-        st.error("Please enter a valid URL.")
+        st.error("Please enter a valid URL (YouTube or a website).")
     else:
         try:
-            with st.spinner("Processing content..."):
+            with st.spinner("Fetching content and summarizing..."):
+                # Load data from YouTube or Website
                 if "youtube.com" in generic_url or "youtu.be" in generic_url:
                     docs = get_youtube_content(generic_url)
                 else:
@@ -164,19 +151,22 @@ if st.button("Summarize"):
                 if docs is None:
                     st.stop()
 
+                # Create the summary chain and run it
                 chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                 output_summary = chain.run(docs)
 
-                st.success("Summary Generated!")
+                # Display the results
+                st.success("Summary Generated Successfully!")
 
-                tab1, tab2 = st.tabs(["Summary", "Original Content"])
+                tab1, tab2 = st.tabs(["Summary", "Raw Content"])
 
                 with tab1:
-                    st.markdown(output_summary)
+                    st.write(output_summary)
 
                 with tab2:
                     if docs:
-                        st.text_area("Raw Content", docs[0].page_content, height=300)
+                        st.text_area("Original Content", docs[0].page_content, height=300)
 
         except Exception as e:
-            st.error(f"An error occurred: {str(e)}")
+            st.error(f"An error occurred: {str(e)}")
+            st.exception(e)
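The core change in this commit swaps the youtube-transcript-api transcript fetch for yt-dlp metadata extraction, so the summary is now built from the video's description rather than its transcript. A minimal standalone sketch of that new path, runnable outside Streamlit (assumes yt-dlp is installed via `pip install yt-dlp`; the video URL is a placeholder, not from the commit):

    # Sketch of the yt-dlp metadata path used by get_youtube_content above.
    import yt_dlp

    ydl_opts = {
        'extract_flat': True,   # skip resolving individual streams where possible
        'quiet': True,
        'no_warnings': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # download=False returns the metadata dict without fetching any media
        info = ydl.extract_info("https://www.youtube.com/watch?v=VIDEO_ID", download=False)

    # The app reads only these standard info-dict keys
    print(info.get('title', ''))
    print(info.get('uploader', 'Unknown'), info.get('view_count', 'Unknown'))
    print(info.get('description', '')[:200])

With download=False, yt_dlp.YoutubeDL.extract_info returns the info dict without downloading media, and title, uploader, upload_date, view_count, and description are the standard keys get_youtube_content reads from it.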