orion support (#11)

* better tables
* black
* orion support
* black
- .gitignore +3 -0
- buster/data/document_embeddings.csv +0 -0
- buster/data/documents.csv +0 -0
- buster/docparser.py +7 -6
- requirements.txt +5 -6
.gitignore CHANGED
@@ -1,3 +1,6 @@
+# Project specific stuff
+buster/data/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
buster/data/document_embeddings.csv DELETED
The diff for this file is too large to render. See raw diff.

buster/data/documents.csv DELETED
The diff for this file is too large to render. See raw diff.
buster/docparser.py CHANGED
@@ -12,7 +12,8 @@ EMBEDDING_MODEL = "text-embedding-ada-002"
 EMBEDDING_ENCODING = "cl100k_base"  # this the encoding for text-embedding-ada-002
 
 
-BASE_URL = "https://docs.mila.quebec/"
+BASE_URL_MILA = "https://docs.mila.quebec/"
+BASE_URL_ORION = "https://orion.readthedocs.io/en/stable/"
 
 
 def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
@@ -28,13 +29,13 @@ def parse_section(nodes: list[bs4.element.NavigableString]) -> str:
     return section
 
 
-def get_all_documents(root_dir: str, max_section_length: int = 2000) -> pd.DataFrame:
+def get_all_documents(root_dir: str, base_url: str, max_section_length: int = 2000) -> pd.DataFrame:
     """Parse all HTML files in `root_dir`, and extract all sections.
 
     Sections are broken into subsections if they are longer than `max_section_length`.
-    Sections correspond to
+    Sections correspond to `section` HTML tags that have a headerlink attached.
     """
-    files = glob.glob("
+    files = glob.glob("**/*.html", root_dir=root_dir, recursive=True)
 
     def get_all_subsections(soup: BeautifulSoup) -> tuple[list[str], list[str], list[str]]:
         found = soup.find_all("a", href=True, class_="headerlink")
@@ -47,7 +48,7 @@ def get_all_documents(root_dir: str, max_section_length: int = 2000) -> pd.DataF
             section_href = section_soup.find_all("a", href=True, class_="headerlink")
 
             # If sections has subsections, keep only the part before the first subsection
-            if len(section_href) > 1:
+            if len(section_href) > 1 and section_soup.section is not None:
                 section_siblings = list(section_soup.section.previous_siblings)[::-1]
                 section = parse_section(section_siblings)
             else:
@@ -87,7 +88,7 @@ def get_all_documents(root_dir: str, max_section_length: int = 2000) -> pd.DataF
         sections_file, urls_file, names_file = get_all_subsections(soup)
         sections.extend(sections_file)
 
-        urls_file = [
+        urls_file = [base_url + os.path.basename(file.name) + url for url in urls_file]
         urls.extend(urls_file)
 
         names.extend(names_file)
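The net effect in docparser.py: the hard-coded Mila base URL becomes a base_url argument (with module constants for both supported doc sites), the glob now recurses into subdirectories, and the subsection split guards against section tags with no nested section. A minimal usage sketch under assumptions not in this commit: the local HTML paths below are hypothetical, and glob.glob's root_dir keyword requires Python 3.10+.

    from buster.docparser import BASE_URL_MILA, BASE_URL_ORION, get_all_documents

    # Hypothetical locations of locally built HTML docs (not part of this diff).
    mila_root = "/path/to/mila-docs/html"
    orion_root = "/path/to/orion-docs/html"

    # Each call recursively globs "**/*.html" under the root and returns a
    # pd.DataFrame of sections; each section's relative anchor is prefixed
    # with the given base URL.
    mila_docs = get_all_documents(mila_root, BASE_URL_MILA)
    orion_docs = get_all_documents(orion_root, BASE_URL_ORION)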
requirements.txt CHANGED
@@ -1,11 +1,10 @@
-pandas
-openai[embeddings]
 bs4
+matplotlib
 numpy
-tiktoken
-openai
 pandas
+plotly
 scikit-learn
+tabulate
 tenacity
-
-
+tiktoken
+openai[embeddings]
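requirements.txt also loses the duplicate pandas entry and the bare openai line (openai[embeddings] already installs the base package), and gains matplotlib, plotly, tabulate, and tiktoken. One plausible link to the "better tables" bullet in the commit message, offered as a sketch rather than anything this diff shows: pandas delegates DataFrame.to_markdown() to tabulate, so listing tabulate enables markdown table output.

    import pandas as pd

    # Toy data; to_markdown() raises ImportError unless tabulate is installed.
    df = pd.DataFrame({"source": ["mila", "orion"], "sections": [3, 5]})
    print(df.to_markdown(index=False))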