Shreyas094 committed
Commit e2808f2 · verified · 1 Parent(s): 0a74a16

Update app.py

Files changed (1): app.py +104 -3
app.py CHANGED
@@ -8,6 +8,16 @@ import os
 from dotenv import load_dotenv
 import shutil
 import tempfile
+import re
+import unicodedata
+from nltk.corpus import stopwords
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.probability import FreqDist
+import nltk
+
+# Download necessary NLTK data
+nltk.download('punkt')
+nltk.download('stopwords')
 
 load_dotenv()  # Load environment variables from .env file
 
@@ -116,6 +126,8 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
         if not result_block:
             print("No more results found.")
             break
+        keywords = term.split()  # Use the search term as keywords for filtering
+
         for result in result_block:
             link = result.find("a", href=True)
             if link:
@@ -125,9 +137,13 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
                     webpage = session.get(link, headers=headers, timeout=timeout)
                     webpage.raise_for_status()
                     visible_text = extract_text_from_webpage(webpage.text)
-                    if len(visible_text) > max_chars_per_page:
-                        visible_text = visible_text[:max_chars_per_page] + "..."
-                    all_results.append({"link": link, "text": visible_text})
+
+                    # Apply preprocessing to the visible text
+                    preprocessed_text = preprocess_web_content(visible_text, keywords)
+
+                    if len(preprocessed_text) > max_chars_per_page:
+                        preprocessed_text = preprocessed_text[:max_chars_per_page] + "..."
+                    all_results.append({"link": link, "text": preprocessed_text})
                 except requests.exceptions.RequestException as e:
                     print(f"Error fetching or processing {link}: {e}")
                     all_results.append({"link": link, "text": None})
@@ -138,6 +154,91 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
     print(f"Total results fetched: {len(all_results)}")
     return all_results
 
+def preprocess_text(text):
+    # Remove HTML tags
+    text = BeautifulSoup(text, "html.parser").get_text()
+
+    # Remove URLs
+    text = re.sub(r'http\S+|www.\S+', '', text)
+
+    # Remove special characters and digits
+    text = re.sub(r'[^a-zA-Z\s]', '', text)
+
+    # Remove extra whitespace
+    text = ' '.join(text.split())
+
+    # Convert to lowercase
+    text = text.lower()
+
+    return text
+
+def remove_boilerplate(text):
+    # List of common boilerplate phrases to remove
+    boilerplate = [
+        "all rights reserved",
+        "terms of service",
+        "privacy policy",
+        "cookie policy",
+        "copyright ©",
+        "follow us on social media"
+    ]
+
+    for phrase in boilerplate:
+        text = text.replace(phrase, '')
+
+    return text
+
+def keyword_filter(text, keywords):
+    sentences = sent_tokenize(text)
+    filtered_sentences = [sentence for sentence in sentences if any(keyword.lower() in sentence.lower() for keyword in keywords)]
+    return ' '.join(filtered_sentences)
+
+def summarize_text(text, num_sentences=3):
+    # Tokenize the text into words
+    words = word_tokenize(text)
+
+    # Remove stopwords
+    stop_words = set(stopwords.words('english'))
+    words = [word for word in words if word.lower() not in stop_words]
+
+    # Calculate word frequencies
+    freq_dist = FreqDist(words)
+
+    # Score sentences based on word frequencies
+    sentences = sent_tokenize(text)
+    sentence_scores = {}
+    for sentence in sentences:
+        for word in word_tokenize(sentence.lower()):
+            if word in freq_dist:
+                if sentence not in sentence_scores:
+                    sentence_scores[sentence] = freq_dist[word]
+                else:
+                    sentence_scores[sentence] += freq_dist[word]
+
+    # Get the top N sentences with highest scores
+    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
+
+    # Sort the selected sentences in the order they appear in the original text
+    summary_sentences = sorted(summary_sentences, key=text.index)
+
+    return ' '.join(summary_sentences)
+
+def preprocess_web_content(content, keywords):
+    # Apply basic preprocessing
+    preprocessed_text = preprocess_text(content)
+
+    # Remove boilerplate
+    preprocessed_text = remove_boilerplate(preprocessed_text)
+
+    # Apply keyword filtering
+    filtered_text = keyword_filter(preprocessed_text, keywords)
+
+    # Summarize the text
+    summarized_text = summarize_text(filtered_text)
+
+    return summarized_text
+
+
 # Function to format the prompt for the Hugging Face API
 def format_prompt(query, search_results, instructions):
     formatted_results = ""
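
For context, the commit replaces the raw-text truncation in google_search with a four-stage pipeline: preprocess_text (HTML/URL/character cleanup), remove_boilerplate, keyword_filter, and summarize_text. A minimal usage sketch follows, assuming app.py is importable; the search term and HTML snippet are illustrative placeholders, not values from the commit:

import nltk
from app import preprocess_web_content  # assumes app.py is on the import path

# The commit downloads these at import time. Note that on NLTK 3.9+ the
# sentence tokenizer loads the 'punkt_tab' resource instead of 'punkt'.
nltk.download('punkt')
nltk.download('stopwords')

term = "solar power storage"  # illustrative search query
keywords = term.split()       # same keyword derivation used in google_search
html = "<p>Solar power storage is improving. All rights reserved.</p>"

# preprocess_text strips the HTML tags internally, so raw markup is accepted
summary = preprocess_web_content(html, keywords)
print(summary)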
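
One caveat, offered as a reviewer note rather than part of the commit: preprocess_text removes every character outside [a-zA-Z\s], including sentence punctuation, yet keyword_filter and summarize_text both call sent_tokenize on the result, so the tokenizer will usually see the whole page as a single long sentence and the summarizer has little to rank. A possible variant (my assumption, not the author's code) keeps basic punctuation and also escapes the dot in the URL pattern:

import re
from bs4 import BeautifulSoup

def preprocess_text(text):
    # Strip HTML tags first, as in the committed version
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove URLs; '\.' makes the dot after 'www' literal
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Keep letters, whitespace, and sentence punctuation so that
    # sent_tokenize can still split the text into sentences downstream
    text = re.sub(r'[^a-zA-Z\s.,!?]', '', text)

    # Collapse whitespace and lowercase, as before
    return ' '.join(text.split()).lower()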