jerpint committed on
Commit
1475390
·
1 Parent(s): 738ac70

update tiktoken requirement

Browse files
Files changed (2) hide show
  1. embed_docs.py +17 -6
  2. requirements.txt +1 -1
embed_docs.py CHANGED
@@ -3,6 +3,7 @@ import os
3
 
4
  from buster.documents_manager import DeepLakeDocumentsManager
5
  from buster.parsers import SphinxParser, get_all_documents
 
6
 
7
  from rtd_scraper.scrape_rtd import sanitize_url, run_spider
8
 
@@ -12,7 +13,7 @@ for name in logging.root.manager.loggerDict:
12
  logger.setLevel(logging.INFO)
13
 
14
 
15
- def embed_documents(homepage_url, save_directory, target_version=None):
16
  # adds https:// and trailing slash
17
  homepage_url = sanitize_url(homepage_url)
18
 
@@ -21,6 +22,11 @@ def embed_documents(homepage_url, save_directory, target_version=None):
21
  homepage_url, save_directory=save_directory, target_version=target_version
22
  )
23
 
 
 
 
 
 
24
  # # Convert the .html pages into chunks using Buster's SphinxParser
25
  # root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/
26
  root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
@@ -31,7 +37,7 @@ def embed_documents(homepage_url, save_directory, target_version=None):
31
  min_section_length=100,
32
  max_section_length=1000,
33
  )
34
- df["source"] = "readthedocs" # Add the source column
35
 
36
  # Initialize the DeepLake vector store
37
  vector_store_path = os.path.join(save_directory, "deeplake_store")
@@ -49,12 +55,17 @@ def embed_documents(homepage_url, save_directory, target_version=None):
49
  num_workers=32,
50
  )
51
 
 
 
 
 
 
52
 
53
  if __name__ == "__main__":
54
- homepage_url = "https://orion.readthedocs.io/"
55
- target_version = "v0.2.7"
56
- save_directory = "outputs/"
57
- embed_documents(
58
  homepage_url=homepage_url,
59
  target_version=target_version,
60
  save_directory=save_directory,
 
3
 
4
  from buster.documents_manager import DeepLakeDocumentsManager
5
  from buster.parsers import SphinxParser, get_all_documents
6
+ from buster.utils import zip_contents
7
 
8
  from rtd_scraper.scrape_rtd import sanitize_url, run_spider
9
 
 
13
  logger.setLevel(logging.INFO)
14
 
15
 
16
+ def crawl_docs(homepage_url, save_directory, target_version=None):
17
  # adds https:// and trailing slash
18
  homepage_url = sanitize_url(homepage_url)
19
 
 
22
  homepage_url, save_directory=save_directory, target_version=target_version
23
  )
24
 
25
+
26
+ def embed_documents(homepage_url, save_directory):
27
+ # # adds https:// and trailing slash
28
+ homepage_url = sanitize_url(homepage_url)
29
+
30
  # # Convert the .html pages into chunks using Buster's SphinxParser
31
  # root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/
32
  root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
 
37
  min_section_length=100,
38
  max_section_length=1000,
39
  )
40
+ df["source"] = homepage_url # Add the source column
41
 
42
  # Initialize the DeepLake vector store
43
  vector_store_path = os.path.join(save_directory, "deeplake_store")
 
55
  num_workers=32,
56
  )
57
 
58
+ def crawl_and_embed_docs(homepage_url, save_directory, target_version=None):
59
+ # crawl_docs(homepage_url, save_directory, target_version)
60
+ # embed_documents(homepage_url, save_directory)
61
+ zip_contents(save_directory, output_path=".")
62
+
63
 
64
  if __name__ == "__main__":
65
+ homepage_url = "https://docs.mila.quebec/"
66
+ target_version = ""
67
+ save_directory = "outputs"
68
+ crawl_and_embed_docs(
69
  homepage_url=homepage_url,
70
  target_version=target_version,
71
  save_directory=save_directory,
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
  buster-doctalk==1.0.28
2
  gradio==4.39.0
3
- scrapy
 
1
  buster-doctalk==1.0.28
2
  gradio==4.39.0
3
+ tiktoken==0.7.0