Shreyas094 committed
Commit e2808f2 · verified · 1 Parent(s): 0a74a16

Update app.py

Files changed (1): app.py +104 -3
app.py CHANGED
@@ -8,6 +8,16 @@ import os
 from dotenv import load_dotenv
 import shutil
 import tempfile
+import re
+import unicodedata
+from nltk.corpus import stopwords
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.probability import FreqDist
+import nltk
+
+# Download necessary NLTK data
+nltk.download('punkt')
+nltk.download('stopwords')
 
 load_dotenv()  # Load environment variables from .env file
 
@@ -116,6 +126,8 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
         if not result_block:
             print("No more results found.")
             break
+        keywords = term.split()  # Use the search term as keywords for filtering
+
         for result in result_block:
             link = result.find("a", href=True)
             if link:
@@ -125,9 +137,13 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
                     webpage = session.get(link, headers=headers, timeout=timeout)
                     webpage.raise_for_status()
                     visible_text = extract_text_from_webpage(webpage.text)
-                    if len(visible_text) > max_chars_per_page:
-                        visible_text = visible_text[:max_chars_per_page] + "..."
-                    all_results.append({"link": link, "text": visible_text})
+
+                    # Apply preprocessing to the visible text
+                    preprocessed_text = preprocess_web_content(visible_text, keywords)
+
+                    if len(preprocessed_text) > max_chars_per_page:
+                        preprocessed_text = preprocessed_text[:max_chars_per_page] + "..."
+                    all_results.append({"link": link, "text": preprocessed_text})
                 except requests.exceptions.RequestException as e:
                     print(f"Error fetching or processing {link}: {e}")
                     all_results.append({"link": link, "text": None})
@@ -138,6 +154,91 @@ def google_search(term, num_results=5, lang="en", timeout=5, safe="active", ssl_
     print(f"Total results fetched: {len(all_results)}")
     return all_results
 
+def preprocess_text(text):
+    # Remove HTML tags
+    text = BeautifulSoup(text, "html.parser").get_text()
+
+    # Remove URLs
+    text = re.sub(r'http\S+|www.\S+', '', text)
+
+    # Remove special characters and digits
+    text = re.sub(r'[^a-zA-Z\s]', '', text)
+
+    # Remove extra whitespace
+    text = ' '.join(text.split())
+
+    # Convert to lowercase
+    text = text.lower()
+
+    return text
+
+def remove_boilerplate(text):
+    # List of common boilerplate phrases to remove
+    boilerplate = [
+        "all rights reserved",
+        "terms of service",
+        "privacy policy",
+        "cookie policy",
+        "copyright ©",
+        "follow us on social media"
+    ]
+
+    for phrase in boilerplate:
+        text = text.replace(phrase, '')
+
+    return text
+
+def keyword_filter(text, keywords):
+    sentences = sent_tokenize(text)
+    filtered_sentences = [sentence for sentence in sentences if any(keyword.lower() in sentence.lower() for keyword in keywords)]
+    return ' '.join(filtered_sentences)
+
+def summarize_text(text, num_sentences=3):
+    # Tokenize the text into words
+    words = word_tokenize(text)
+
+    # Remove stopwords
+    stop_words = set(stopwords.words('english'))
+    words = [word for word in words if word.lower() not in stop_words]
+
+    # Calculate word frequencies
+    freq_dist = FreqDist(words)
+
+    # Score sentences based on word frequencies
+    sentences = sent_tokenize(text)
+    sentence_scores = {}
+    for sentence in sentences:
+        for word in word_tokenize(sentence.lower()):
+            if word in freq_dist:
+                if sentence not in sentence_scores:
+                    sentence_scores[sentence] = freq_dist[word]
+                else:
+                    sentence_scores[sentence] += freq_dist[word]
+
+    # Get the top N sentences with highest scores
+    summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[:num_sentences]
+
+    # Sort the selected sentences in the order they appear in the original text
+    summary_sentences = sorted(summary_sentences, key=text.index)
+
+    return ' '.join(summary_sentences)
+
+def preprocess_web_content(content, keywords):
+    # Apply basic preprocessing
+    preprocessed_text = preprocess_text(content)
+
+    # Remove boilerplate
+    preprocessed_text = remove_boilerplate(preprocessed_text)
+
+    # Apply keyword filtering
+    filtered_text = keyword_filter(preprocessed_text, keywords)
+
+    # Summarize the text
+    summarized_text = summarize_text(filtered_text)
+
+    return summarized_text
+
+
 # Function to format the prompt for the Hugging Face API
 def format_prompt(query, search_results, instructions):
     formatted_results = ""
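
For context, the commit replaces the raw-text truncation in google_search with a four-stage pipeline: preprocess_text (HTML/URL/character cleanup), remove_boilerplate, keyword_filter, and summarize_text. A minimal usage sketch follows, assuming app.py is importable; the search term and HTML snippet are illustrative placeholders, not values from the commit:

import nltk
from app import preprocess_web_content  # assumes app.py is on the import path

# The commit downloads these at import time. Note that on NLTK 3.9+ the
# sentence tokenizer loads the 'punkt_tab' resource instead of 'punkt'.
nltk.download('punkt')
nltk.download('stopwords')

term = "solar power storage"  # illustrative search query
keywords = term.split()       # same keyword derivation used in google_search
html = "<p>Solar power storage is improving. All rights reserved.</p>"

# preprocess_text strips the HTML tags internally, so raw markup is accepted
summary = preprocess_web_content(html, keywords)
print(summary)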
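
One caveat, offered as a reviewer note rather than part of the commit: preprocess_text removes every character outside [a-zA-Z\s], including sentence punctuation, yet keyword_filter and summarize_text both call sent_tokenize on the result, so the tokenizer will usually see the whole page as a single long sentence and the summarizer has little to rank. A possible variant (my assumption, not the author's code) keeps basic punctuation and also escapes the dot in the URL pattern:

import re
from bs4 import BeautifulSoup

def preprocess_text(text):
    # Strip HTML tags first, as in the committed version
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove URLs; '\.' makes the dot after 'www' literal
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Keep letters, whitespace, and sentence punctuation so that
    # sent_tokenize can still split the text into sentences downstream
    text = re.sub(r'[^a-zA-Z\s.,!?]', '', text)

    # Collapse whitespace and lowercase, as before
    return ' '.join(text.split()).lower()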