thivy committed on
Commit
abb8566
·
1 Parent(s): 3d1237b

feat: :sparkles: add scraper as tool

Browse files
Files changed (1) hide show
  1. app.py +32 -7
app.py CHANGED
@@ -1,4 +1,9 @@
1
  from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
 
 
 
 
 
2
  import datetime
3
  import requests
4
  import pytz
@@ -7,16 +12,36 @@ from tools.final_answer import FinalAnswerTool
7
 
8
  from Gradio_UI import GradioUI
9
 
10
- # Below is an example of a tool that does nothing. Amaze us with your creativity !
 
 
 
 
 
 
 
 
 
11
  @tool
12
- def my_custom_tool(arg1:str, arg2:int)-> str: #it's important to specify the return type
13
  #Keep this format for the description / args / args description but feel free to modify the tool
14
- """A tool that does nothing yet
15
  Args:
16
- arg1: the first argument
17
- arg2: the second argument
18
  """
19
- return "What magic will you build ?"
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  @tool
22
  def get_current_time_in_timezone(timezone: str) -> str:
@@ -51,7 +76,7 @@ with open("prompts.yaml", 'r') as stream:
51
 
52
  agent = CodeAgent(
53
  model=model,
54
- tools=[final_answer], ## add your tools here (don't remove final answer)
55
  max_steps=6,
56
  verbosity_level=1,
57
  grammar=None,
 
1
  from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
2
+
3
+ from bs4 import BeautifulSoup
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+
7
  import datetime
8
  import requests
9
  import pytz
 
12
 
13
  from Gradio_UI import GradioUI
14
 
15
def categorize_content(text, categories):
    """Pick the category whose example text is most similar to ``text``.

    The input text and every category's example text are vectorized together
    with TF-IDF, and the category with the highest cosine similarity to the
    input text is returned.

    Args:
        text: The text to categorize.
        categories: A dictionary with category names as keys and example
            text as values.

    Returns:
        The best-matching category name, or "Uncategorized" when there is
        nothing to compare (no text or no categories), when no usable
        vocabulary can be built from the documents, or when the text has
        zero similarity to every category.
    """
    # Nothing to compare: bail out before fitting a vectorizer on empty
    # input, which would raise instead of producing a result.
    if not categories or not text or not text.strip():
        return "Uncategorized"

    category_names = list(categories)
    category_texts = list(categories.values())

    try:
        # Row 0 is the input text; rows 1..n are the category examples.
        tfidf_matrix = TfidfVectorizer().fit_transform([text, *category_texts])
    except ValueError:
        # Raised by scikit-learn when the documents yield an empty vocabulary
        # (e.g. only stop words or single-character tokens).
        return "Uncategorized"

    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    if not similarities.any():
        # The text shares no terms with any category example.
        return "Uncategorized"
    return category_names[similarities.argmax()]
23
+
24
+
25
  @tool
26
def scrape_webpage(url:str, categories:dict = None)-> str: # it's important to specify the return type
    # Keep this format for the description / args / args description but feel free to modify the tool
    """A tool that scrapes the visible text of a webpage and optionally categorizes it using TF-IDF similarity.
    Args:
        url: The full URL of the webpage to scrape, including the http:// or https:// scheme.
        categories: A dictionary with category names as keys and example text as values. When omitted or empty, the text is returned without a category.
    """
    # Only the network call can raise RequestException, so keep the try
    # body limited to it and report failures as a plain string result.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        return f"Error fetching webpage: {e}"

    soup = BeautifulSoup(response.text, "html.parser")
    # Join every non-empty text fragment of the page with single spaces.
    text_content = ' '.join(soup.stripped_strings)

    if not categories:
        return f"The following text content was scraped: {text_content}"

    try:
        category = categorize_content(text_content, categories)
    except ValueError:
        # TF-IDF cannot build a vocabulary from empty / token-less text;
        # still return the scraped content rather than failing the tool call.
        category = "Uncategorized"
    return f"The following text content {text_content} was scraped from {url} and categorized as: {category}"
45
 
46
  @tool
47
  def get_current_time_in_timezone(timezone: str) -> str:
 
76
 
77
  agent = CodeAgent(
78
  model=model,
79
+ tools=[final_answer, scrape_webpage], ## add your tools here (don't remove final answer)
80
  max_steps=6,
81
  verbosity_level=1,
82
  grammar=None,