jarif committed
Commit 8f7a009 · verified · 1 Parent(s): b658b8b

Upload app.py

Files changed (1):
  app.py  +83 -73
app.py CHANGED
@@ -6,7 +6,8 @@ from langchain.prompts import PromptTemplate
 from langchain_groq import ChatGroq
 from langchain.chains.summarize import load_summarize_chain
 from langchain.schema import Document
-import yt_dlp
+from youtube_transcript_api import YouTubeTranscriptApi
+from urllib.parse import urlparse, parse_qs
 import requests
 from bs4 import BeautifulSoup
 import re
@@ -15,19 +16,31 @@ import re
 load_dotenv()
 
 # Streamlit App
-st.set_page_config(page_title="LangChain: Summarize Text From YT or Website", page_icon="🦜")
-st.title("🦜 LangChain: Summarize Text From YT or Website")
-st.subheader("Summarize URL")
+st.set_page_config(page_title="AI Content Summarizer", page_icon="📚")
+
+# Create two columns for the title
+col1, col2 = st.columns([0.85, 0.15])
+with col1:
+    st.title("AI Content Summarizer")
+    st.caption("Powered by LangChain & Gemma 🤖")
+
+with col2:
+    st.image("https://python.langchain.com/img/favicon.ico", width=50)
+
+st.markdown("""
+### About This App
+This application leverages the power of LangChain and Gemma AI to automatically generate concise summaries from YouTube videos and web articles. Whether you're researching a topic, catching up on content, or trying to quickly grasp key information, this tool can help save time by distilling content into clear, readable summaries.
+""")
 
 # Get API Key & URL input
 groq_api_key = os.getenv("GROQ_API_KEY")
 if not groq_api_key:
     st.error("GROQ API Key not found. Please check your environment variables.")
 
-generic_url = st.text_input("Enter YouTube or Website URL", label_visibility="collapsed")
+generic_url = st.text_input("Enter YouTube or Website URL", placeholder="https://example.com or https://youtube.com/watch?v=...")
 
 # LangChain Model with Groq API
-llm = ChatGroq(model="gemma2-9b-it", groq_api_key=groq_api_key)
+llm = ChatGroq(model="gemma-7b-it", groq_api_key=groq_api_key)
 
 # Prompt Template
 prompt_template = """
@@ -39,36 +52,44 @@ Focus on the main points and key insights. Write in a professional tone.
 """
 prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
 
+def get_youtube_id(url):
+    """Extract video ID from YouTube URL"""
+    if 'youtube.com' in url:
+        query = parse_qs(urlparse(url).query)
+        return query.get('v', [None])[0]
+    elif 'youtu.be' in url:
+        return urlparse(url).path[1:]
+    return None
+
 def get_youtube_content(url):
-    """Get content from YouTube video using yt-dlp"""
+    """Get content from YouTube video using youtube-transcript-api"""
     try:
-        ydl_opts = {
-            'format': 'worst',
-            'extract_flat': True,
-            'quiet': True,
-            'no_warnings': True
-        }
-
-        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
-            info = ydl.extract_info(url, download=False)
-
-        title = info.get('title', '')
-        description = info.get('description', '')
-        views = info.get('view_count', 'Unknown')
-        uploader = info.get('uploader', 'Unknown')
-        upload_date = info.get('upload_date', 'Unknown')
-
-        content = f"""
-        Video Title: {title}
-        Uploader: {uploader}
-        Upload Date: {upload_date}
-        Views: {views}
+        video_id = get_youtube_id(url)
+        if not video_id:
+            raise ValueError("Could not extract YouTube video ID")
+
+        # Get transcript
+        transcript = YouTubeTranscriptApi.get_transcript(video_id)
+        transcript_text = ' '.join([entry['text'] for entry in transcript])
+
+        # Get video info using a simple request
+        response = requests.get(f"https://www.youtube.com/oembed?url=https://www.youtube.com/watch?v={video_id}&format=json")
+        if response.status_code == 200:
+            video_info = response.json()
+            title = video_info.get('title', 'Unknown Title')
+            author = video_info.get('author_name', 'Unknown Author')
+        else:
+            title = "Unknown Title"
+            author = "Unknown Author"
 
-        Description:
-        {description}
+        content = f"""
+        Video Title: {title}
+        Channel: {author}
+        Transcript:
+        {transcript_text}
         """
-        return [Document(page_content=content)]
-
+        return [Document(page_content=content)]
+
     except Exception as e:
         st.error(f"Error getting YouTube content: {str(e)}")
         return None
@@ -76,48 +97,42 @@ Description:
 def get_website_content(url):
     """Get content from website using requests and BeautifulSoup"""
     try:
-        # Send request with headers to mimic a browser
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
         response = requests.get(url, headers=headers, verify=False)
         response.raise_for_status()
 
-        # Parse HTML
         soup = BeautifulSoup(response.text, 'html.parser')
 
-        # Remove script and style elements
-        for script in soup(["script", "style"]):
-            script.decompose()
+        # Remove unwanted elements
+        for element in soup(['script', 'style', 'header', 'footer', 'nav']):
+            element.decompose()
 
-        # Get title
         title = soup.title.string if soup.title else "No title found"
 
-        # Get main content (adjust selectors based on the website structure)
+        # Get main content with improved selection
        main_content = ""
+        selectors = [
+            'article', 'main',
+            '[role="main"]',
+            '.post-content',
+            '.article-content',
+            '.content',
+            '#content'
+        ]
 
-        # Try to find article content first
-        article = soup.find('article')
-        if article:
-            main_content = article.get_text()
-        else:
-            # If no article tag, try common content containers
-            content_tags = ['main', 'div.content', 'div.post-content', 'div.article-content']
-            for tag in content_tags:
-                element = soup.select_one(tag)
-                if element:
-                    main_content = element.get_text()
-                    break
-
-        # If still no content, get all paragraph text
-        if not main_content:
-            paragraphs = soup.find_all('p')
-            main_content = '\n'.join(p.get_text() for p in paragraphs)
+        for selector in selectors:
+            if soup.select_one(selector):
+                main_content = soup.select_one(selector).get_text()
+                break
+
+        if not main_content:
+            paragraphs = soup.find_all('p')
+            main_content = '\n'.join(p.get_text() for p in paragraphs)
 
-        # Clean up the text
-        # Remove extra whitespace and newlines
+        # Clean up text
         main_content = re.sub(r'\s+', ' ', main_content).strip()
-        # Remove any remaining HTML tags
         main_content = re.sub(r'<[^>]+>', '', main_content)
 
         content = f"""
@@ -130,19 +145,17 @@ Content:
         return [Document(page_content=content)]
 
     except Exception as e:
-        st.error(f"Error fetching or processing {url}, exception:\n{str(e)}")
+        st.error(f"Error processing website content: {str(e)}")
         return None
 
-if st.button("Summarize the Content from YT or Website"):
-    # Validate Input
+if st.button("Summarize"):
     if not groq_api_key or not generic_url.strip():
         st.error("Please provide a valid API key and URL.")
     elif not validators.url(generic_url):
-        st.error("Please enter a valid URL (YouTube or a website).")
+        st.error("Please enter a valid URL.")
     else:
         try:
-            with st.spinner("Fetching content and summarizing..."):
-                # Load data from YouTube or Website
+            with st.spinner("Processing content..."):
                 if "youtube.com" in generic_url or "youtu.be" in generic_url:
                     docs = get_youtube_content(generic_url)
                 else:
@@ -151,22 +164,19 @@ if st.button("Summarize the Content from YT or Website"):
                 if docs is None:
                     st.stop()
 
-                # Create the summary chain and run it
                 chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
                 output_summary = chain.run(docs)
 
-                # Display the results
-                st.success("Summary Generated Successfully!")
+                st.success("Summary Generated!")
 
-                tab1, tab2 = st.tabs(["Summary", "Raw Content"])
+                tab1, tab2 = st.tabs(["Summary", "Original Content"])
 
                 with tab1:
-                    st.write(output_summary)
+                    st.markdown(output_summary)
 
                 with tab2:
                     if docs:
-                        st.text_area("Original Content", docs[0].page_content, height=300)
+                        st.text_area("Raw Content", docs[0].page_content, height=300)
 
         except Exception as e:
-            st.error(f"An error occurred: {str(e)}")
-            st.exception(e)
+            st.error(f"An error occurred: {str(e)}")
 
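Note: the new `get_youtube_id` helper handles the two URL shapes the button handler recognizes (`youtube.com` watch links and `youtu.be` short links). A few illustrative checks of that exact logic, runnable on their own (the IDs are placeholders):

```python
from urllib.parse import urlparse, parse_qs

def get_youtube_id(url):
    """Extract the video ID from a YouTube URL (same logic as app.py)."""
    if 'youtube.com' in url:
        query = parse_qs(urlparse(url).query)
        return query.get('v', [None])[0]
    elif 'youtu.be' in url:
        return urlparse(url).path[1:]
    return None

# Watch pages carry the ID in the ?v= query parameter:
assert get_youtube_id("https://www.youtube.com/watch?v=abc123") == "abc123"
# Short links carry it in the path:
assert get_youtube_id("https://youtu.be/abc123") == "abc123"
# Anything else returns None, which get_youtube_content reports as an error:
assert get_youtube_id("https://example.com/page") is None
```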
 
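The transcript call in `get_youtube_content` catches every failure with a bare `except Exception`. `youtube-transcript-api` raises specific exceptions when a video simply has no captions, so those cases can be separated from genuine errors. A minimal sketch, assuming the pre-1.0 `get_transcript` interface this commit imports (the helper name `fetch_transcript_text` is illustrative, not part of the commit):

```python
# Sketch: distinguish "no captions available" from genuine failures.
from youtube_transcript_api import (
    YouTubeTranscriptApi,
    TranscriptsDisabled,
    NoTranscriptFound,
)

def fetch_transcript_text(video_id):
    """Return the transcript as one string, or None if no captions exist."""
    try:
        # get_transcript returns a list of dicts with 'text', 'start', 'duration'.
        entries = YouTubeTranscriptApi.get_transcript(video_id)
        return ' '.join(entry['text'] for entry in entries)
    except (TranscriptsDisabled, NoTranscriptFound):
        return None  # no captions for this video; not an application error
```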
 
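The metadata lookup relies on YouTube's public oEmbed endpoint, which returns JSON including `title` and `author_name` for public videos and requires no API key. Tried standalone, the request looks like this (a sketch; the `timeout` argument is an addition here so a slow response cannot hang the Streamlit spinner indefinitely):

```python
import requests

video_id = "dQw4w9WgXcQ"  # placeholder video ID
response = requests.get(
    "https://www.youtube.com/oembed"
    f"?url=https://www.youtube.com/watch?v={video_id}&format=json",
    timeout=10,
)
if response.status_code == 200:
    info = response.json()
    print(info["title"], "by", info["author_name"])
```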
 
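`output_summary = chain.run(docs)` uses the legacy chain interface; on LangChain 0.1 and later, `Chain.run` is deprecated in favor of `invoke`. If the app is ever moved to such a version, an equivalent call would look like the following sketch (same `llm`, `prompt`, and `docs` as in app.py):

```python
from langchain.chains.summarize import load_summarize_chain

chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
# The "stuff" chain expects documents under "input_documents" and
# returns a dict whose summary lives under "output_text".
result = chain.invoke({"input_documents": docs})
output_summary = result["output_text"]
```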