# thivy's picture
# feat: :sparkles: add scraper as tool
# abb8566
from smolagents import CodeAgent,DuckDuckGoSearchTool, HfApiModel,load_tool,tool
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import datetime
import requests
import pytz
import yaml
from tools.final_answer import FinalAnswerTool
from Gradio_UI import GradioUI
def categorize_content(text, categories):
    """Return the category whose example text is most similar to ``text``.

    ``text`` and every category example are vectorized together with TF-IDF,
    and the cosine similarity between ``text`` and each example decides the
    winner.

    Args:
        text: The text to categorize.
        categories: A mapping of category name -> example text for that
            category.

    Returns:
        The name of the best-matching category, or ``"Uncategorized"`` when
        there is nothing to compare (no categories, blank text, no usable
        tokens) or when every similarity is zero.
    """
    # Guard the degenerate inputs up front: with no categories there are no
    # rows to compare against, and blank text can only produce zero similarity.
    if not categories or not text or not text.strip():
        return "Uncategorized"

    category_names = list(categories.keys())
    category_texts = list(categories.values())

    vectorizer = TfidfVectorizer()
    try:
        # Row 0 is the input text; rows 1.. line up with category_names.
        tfidf_matrix = vectorizer.fit_transform([text] + category_texts)
    except ValueError:
        # The vectorizer rejects input from which it cannot extract a single
        # token ("empty vocabulary"); treat that as "no match" rather than crash.
        return "Uncategorized"

    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
    if not similarities.any():
        # All-zero similarities: the text shares no terms with any category.
        return "Uncategorized"
    return category_names[similarities.argmax()]
@tool
def scrape_webpage(url: str, categories: dict = None) -> str:  # it's important to specify the return type
    # Keep this format for the description / args / args description but feel free to modify the tool
    """A tool that scrapes the visible text of a webpage and optionally categorizes it using NLP.

    Args:
        url: The full URL of the webpage to scrape, including the scheme (e.g., 'https://example.com').
        categories: A dictionary with category names as keys and example text as values. If omitted or empty, the text is returned without a category.
    """
    # Only the network call can raise RequestException, so keep the try body
    # limited to it; parsing and categorization happen afterwards.
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        html = response.text
    except requests.RequestException as e:
        return f"Error fetching webpage: {str(e)}"

    soup = BeautifulSoup(html, "html.parser")
    # Join every non-empty text fragment of the page with single spaces.
    text_content = " ".join(soup.stripped_strings)

    if not categories:
        return f"The following text content was scraped: {text_content}"

    try:
        category = categorize_content(text_content, categories)
    except ValueError:
        # The TF-IDF step can reject input it cannot tokenize; still return
        # the scraped text instead of failing the whole tool call.
        category = "Uncategorized"
    return f"The following text content {text_content} was scraped from {url} and categorized as: {category}"
@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """A tool that fetches the current local time in a specified timezone.

    Args:
        timezone: A string representing a valid timezone (e.g., 'America/New_York').
    """
    # NOTE(review): the docstring wording is kept as-is because the @tool
    # decorator presumably reads it as the tool's description — confirm
    # before rewording.
    try:
        # Resolve the zone name, then format "now" as seen from that zone.
        zone = pytz.timezone(timezone)
        now_in_zone = datetime.datetime.now(zone)
        local_time = now_in_zone.strftime("%Y-%m-%d %H:%M:%S")
    except Exception as e:
        # Any failure (e.g. an unknown zone name) is reported back as text
        # rather than raised to the caller.
        return f"Error fetching time for timezone '{timezone}': {e!s}"
    return f"The current local time in {timezone} is: {local_time}"
# Tool instance registered with the agent below (must stay in the tools list).
final_answer = FinalAnswerTool()

# Model backing the agent.
model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # it is possible that this model may be overloaded
    custom_role_conversions=None,
)

# Import tool from Hub
# NOTE(review): this tool is loaded but never added to the agent's tools list
# below — confirm whether that is intentional.
image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

# Read the prompt templates with an explicit encoding so the result does not
# depend on the platform's default locale encoding.
with open("prompts.yaml", "r", encoding="utf-8") as stream:
    prompt_templates = yaml.safe_load(stream)

agent = CodeAgent(
    model=model,
    tools=[final_answer, scrape_webpage],  # add your tools here (don't remove final answer)
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name=None,
    description=None,
    prompt_templates=prompt_templates,
)

# Start the Gradio front end for the agent.
GradioUI(agent).launch()