jarif committed on
Commit
2fc5eb1
·
verified ·
1 Parent(s): 4ab3b79

Upload 3 files

Browse files
Files changed (3) hide show
  1. .env +3 -0
  2. app.py +172 -0
  3. requirements.txt +0 -0
.env ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ GROQ_API_KEY="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"
2
+ LANGCHAIN_API_KEY="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"
3
+ # OPENAI_API_KEY=""
+ # SECURITY(review): live GROQ/LANGCHAIN API keys are committed in the lines
+ # above — rotate them immediately and keep .env out of version control
+ # (add it to .gitignore); load secrets from the deployment environment instead.
app.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import validators
import streamlit as st
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
import yt_dlp
import requests
from bs4 import BeautifulSoup
import re

# Load environment variables (expects GROQ_API_KEY in a local .env file).
load_dotenv()

# Streamlit App: page chrome and headings.
st.set_page_config(page_title="LangChain: Summarize Text From YT or Website", page_icon="🦜")
st.title("🦜 LangChain: Summarize Text From YT or Website")
st.subheader("Summarize URL")

# Get API Key & URL input
groq_api_key = os.getenv("GROQ_API_KEY")
if not groq_api_key:
    # NOTE(review): execution continues past this error — ChatGroq below is
    # still constructed with groq_api_key=None. The button handler re-checks
    # the key, but consider st.stop() here to fail fast.
    st.error("GROQ API Key not found. Please check your environment variables.")

generic_url = st.text_input("Enter YouTube or Website URL", label_visibility="collapsed")

# LangChain Model with Groq API (Gemma 2 9B instruct via Groq).
llm = ChatGroq(model="gemma2-9b-it", groq_api_key=groq_api_key)

# Prompt Template used by the "stuff" summarize chain; {text} receives the
# full fetched document content.
prompt_template = """
Provide a clear and concise summary in 300 words of the following content:

{text}

Focus on the main points and key insights. Write in a professional tone.
"""
prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
41
+
42
+ def get_youtube_content(url):
43
+ """Get content from YouTube video using yt-dlp"""
44
+ try:
45
+ ydl_opts = {
46
+ 'format': 'worst',
47
+ 'extract_flat': True,
48
+ 'quiet': True,
49
+ 'no_warnings': True
50
+ }
51
+
52
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
53
+ info = ydl.extract_info(url, download=False)
54
+
55
+ title = info.get('title', '')
56
+ description = info.get('description', '')
57
+ views = info.get('view_count', 'Unknown')
58
+ uploader = info.get('uploader', 'Unknown')
59
+ upload_date = info.get('upload_date', 'Unknown')
60
+
61
+ content = f"""
62
+ Video Title: {title}
63
+ Uploader: {uploader}
64
+ Upload Date: {upload_date}
65
+ Views: {views}
66
+
67
+ Description:
68
+ {description}
69
+ """
70
+ return [Document(page_content=content)]
71
+
72
+ except Exception as e:
73
+ st.error(f"Error getting YouTube content: {str(e)}")
74
+ return None
75
+
76
+ def get_website_content(url):
77
+ """Get content from website using requests and BeautifulSoup"""
78
+ try:
79
+ # Send request with headers to mimic a browser
80
+ headers = {
81
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
82
+ }
83
+ response = requests.get(url, headers=headers, verify=False)
84
+ response.raise_for_status()
85
+
86
+ # Parse HTML
87
+ soup = BeautifulSoup(response.text, 'html.parser')
88
+
89
+ # Remove script and style elements
90
+ for script in soup(["script", "style"]):
91
+ script.decompose()
92
+
93
+ # Get title
94
+ title = soup.title.string if soup.title else "No title found"
95
+
96
+ # Get main content (adjust selectors based on the website structure)
97
+ main_content = ""
98
+
99
+ # Try to find article content first
100
+ article = soup.find('article')
101
+ if article:
102
+ main_content = article.get_text()
103
+ else:
104
+ # If no article tag, try common content containers
105
+ content_tags = ['main', 'div.content', 'div.post-content', 'div.article-content']
106
+ for tag in content_tags:
107
+ element = soup.select_one(tag)
108
+ if element:
109
+ main_content = element.get_text()
110
+ break
111
+
112
+ # If still no content, get all paragraph text
113
+ if not main_content:
114
+ paragraphs = soup.find_all('p')
115
+ main_content = '\n'.join(p.get_text() for p in paragraphs)
116
+
117
+ # Clean up the text
118
+ # Remove extra whitespace and newlines
119
+ main_content = re.sub(r'\s+', ' ', main_content).strip()
120
+ # Remove any remaining HTML tags
121
+ main_content = re.sub(r'<[^>]+>', '', main_content)
122
+
123
+ content = f"""
124
+ Title: {title}
125
+ URL: {url}
126
+
127
+ Content:
128
+ {main_content}
129
+ """
130
+ return [Document(page_content=content)]
131
+
132
+ except Exception as e:
133
+ st.error(f"Error fetching or processing {url}, exception:\n{str(e)}")
134
+ return None
135
+
136
+ if st.button("Summarize the Content from YT or Website"):
137
+ # Validate Input
138
+ if not groq_api_key or not generic_url.strip():
139
+ st.error("Please provide a valid API key and URL.")
140
+ elif not validators.url(generic_url):
141
+ st.error("Please enter a valid URL (YouTube or a website).")
142
+ else:
143
+ try:
144
+ with st.spinner("Fetching content and summarizing..."):
145
+ # Load data from YouTube or Website
146
+ if "youtube.com" in generic_url or "youtu.be" in generic_url:
147
+ docs = get_youtube_content(generic_url)
148
+ else:
149
+ docs = get_website_content(generic_url)
150
+
151
+ if docs is None:
152
+ st.stop()
153
+
154
+ # Create the summary chain and run it
155
+ chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
156
+ output_summary = chain.run(docs)
157
+
158
+ # Display the results
159
+ st.success("Summary Generated Successfully!")
160
+
161
+ tab1, tab2 = st.tabs(["Summary", "Raw Content"])
162
+
163
+ with tab1:
164
+ st.write(output_summary)
165
+
166
+ with tab2:
167
+ if docs:
168
+ st.text_area("Original Content", docs[0].page_content, height=300)
169
+
170
+ except Exception as e:
171
+ st.error(f"An error occurred: {str(e)}")
172
+ st.exception(e)
requirements.txt ADDED
Binary file (11.6 kB). View file