import logging
import os

from buster.docparser import get_all_documents
from buster.documents_manager import DeepLakeDocumentsManager
from buster.parser import SphinxParser
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import CloseSpider
from scrapy.utils.project import get_project_settings

from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider

# Scrapy seems to set logging for all apps to DEBUG, so dial every logger back to INFO here.
for name in logging.root.manager.loggerDict:
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

def run_spider(homepage_url, save_directory):
    """Crawl the docs site starting at homepage_url and save the .html pages under save_directory."""
    # settings_file_path = 'rtd_scraper.tutorial.settings'  # The path seen from top-level, ie. from cfg.py
    # os.environ.setdefault('SCRAPY_SETTINGS_MODULE', settings_file_path)
    process = CrawlerProcess(settings=get_project_settings())
    process.crawl(DocsSpider, homepage_url=homepage_url, save_dir=save_directory)

    # Start the crawling process (blocks until the spider finishes)
    process.start()

    # Stop the crawling process gracefully
    process.stop()

def scrape_rtd(homepage_url, save_directory):
    """Scrape a Read the Docs site and parse the saved pages into document chunks."""
    # Crawl the website using scrapy
    run_spider(homepage_url, save_directory=save_directory)

    # Convert the .html pages into chunks using Buster's SphinxParser.
    # root_dir is the folder containing the scraped content, e.g. crawled_outputs/buster.readthedocs.io/
    root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
    df = get_all_documents(
        root_dir=root_dir,
        base_url=homepage_url,
        parser_cls=SphinxParser,
        min_section_length=100,
        max_section_length=1000,
    )

    # Add the source column
    df["source"] = "readthedocs"

    # # Initialize the DeepLake vector store
    # dm = DeepLakeDocumentsManager(
    #     vector_store_path=os.path.join(save_directory, "deeplake_store"),
    #     overwrite=True,
    #     required_columns=["url", "content", "source", "title"],
    # )
    #
    # # Add all embeddings to the vector store
    # dm.batch_add(
    #     df=df,
    #     batch_size=3000,
    #     min_time_interval=60,
    #     num_workers=32,
    # )

if __name__ == "__main__":
    homepage_url = "https://buster.readthedocs.io/"
    scrape_rtd(homepage_url=homepage_url, save_directory="outputs/")
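
# --- Optional sketch: inspect the parsed chunks ---
# Not part of the original script; a minimal example of what scrape_rtd produces.
# It assumes the run above has already saved the site under "outputs/", re-parses
# the saved .html files with the same settings, and writes the chunks to a CSV
# ("chunks.csv" is an arbitrary filename) so they can be reviewed before embedding.
#
# chunks = get_all_documents(
#     root_dir=os.path.join("outputs/", "buster.readthedocs.io/"),
#     base_url="https://buster.readthedocs.io/",
#     parser_cls=SphinxParser,
#     min_section_length=100,
#     max_section_length=1000,
# )
# chunks.to_csv("outputs/chunks.csv", index=False)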