Commit: remove unused code, add exceptions if variables not set

Files changed:
- app.py (+34 -14)
- cfg.py (+28 -30)
- rtd_scraper/scrape_rtd.py (+0 -53)
app.py
CHANGED

@@ -6,34 +6,54 @@ import pandas as pd
 from buster.completers import Completion
 
 # from embed_docs import embed_rtd_website
-from rtd_scraper.scrape_rtd import scrape_rtd
+# from rtd_scraper.scrape_rtd import scrape_rtd
+from embed_docs import embed_documents
 import cfg
 from cfg import setup_buster
 
+# Typehint for chatbot history
+ChatHistory = list[list[Optional[str], Optional[str]]]
+
+
+# Because this is a one-click deploy app, we will be relying on env. variables being set
+openai_api_key = os.getenv("OPENAI_API_KEY")  # Mandatory for app to work
+readthedocs_url = os.getenv("READTHEDOCS_URL")  # Mandatory for app to work as intended
+readthedocs_version = os.getenv("READTHEDOCS_VERSION")
 
-
-if os.getenv("OPENAI_API_KEY") is None:
+if openai_api_key is None:
     print(
-        "Warning: No
+        "Warning: No OPENAI_API_KEY detected. Set it with 'export OPENAI_API_KEY=sk-...'."
     )
 
+if readthedocs_url is None:
+    raise ValueError(
+        "No READTHEDOCS_URL detected. Set it with e.g. 'export READTHEDOCS_URL=https://orion.readthedocs.io/'"
+    )
 
-
-
+if readthedocs_version is None:
+    print(
+        """
+        Warning: No READTHEDOCS_VERSION detected. If multiple versions of the docs exist, they will all be scraped.
+        Set it with e.g. 'export READTHEDOCS_VERSION=en/stable'
+        """
+    )
 
-# scrape and embed content from readthedocs website
-# comment out if already embedded locally to avoid extra costs
-scrape_rtd(
-    homepage_url=homepage_url, save_directory="outputs/", target_version=target_version
-)
 
+# Override to put it anywhere
+save_directory = "outputs/"
 
-#
-
+# scrape and embed content from readthedocs website
+embed_documents(
+    homepage_url=readthedocs_url,
+    save_directory=save_directory,
+    target_version=readthedocs_version,
+)
 
+# Setup RAG agent
 buster = setup_buster(cfg.buster_cfg)
 
 
+# Setup Gradio app
 def add_user_question(
     user_question: str, chat_history: Optional[ChatHistory] = None
 ) -> ChatHistory:

@@ -157,5 +177,5 @@ with demo:
 )
 
 
-demo.queue(concurrency_count=
+demo.queue(concurrency_count=8)
 demo.launch(share=False)
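
Since app.py now reads its configuration from environment variables at import time, the variables must be set before the module loads. A minimal sketch of a local run, using only the example values from the commit's own warning messages (the API key is a placeholder):

import os

# Placeholder values taken from the error messages above; use real ones before launching.
os.environ["OPENAI_API_KEY"] = "sk-..."  # mandatory for the app to work
os.environ["READTHEDOCS_URL"] = "https://orion.readthedocs.io/"  # mandatory
os.environ["READTHEDOCS_VERSION"] = "en/stable"  # optional; if unset, all doc versions get scraped

import app  # noqa: E402  # importing app.py triggers the scrape/embed step and builds the demo

In practice the same thing is done from a shell with export, as the messages suggest; assigning to os.environ only works if it happens before app.py is imported.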
cfg.py
CHANGED

@@ -6,37 +6,7 @@ from buster.retriever import DeepLakeRetriever, Retriever
 from buster.tokenizers import GPTTokenizer
 from buster.validators import QuestionAnswerValidator, Validator
 
-from rtd_scraper.scrape_rtd import scrape_rtd
-
 buster_cfg = BusterConfig(
-    validator_cfg={
-        "unknown_response_templates": [
-            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
-        ],
-        "unknown_threshold": 0.85,
-        "embedding_model": "text-embedding-ada-002",
-        "use_reranking": True,
-        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
-        "check_question_prompt": """You are an chatbot answering questions on python libraries.
-
-Your job is to determine wether or not a question is valid, and should be answered.
-A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
-
-For example:
-
-Q: How can I install the library?
-true
-
-Q: What is the meaning of life?
-false
-
-A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
-        "completion_kwargs": {
-            "model": "gpt-3.5-turbo",
-            "stream": False,
-            "temperature": 0,
-        },
-    },
     retriever_cfg={
         "path": "outputs/deeplake_store",
         "top_k": 3,

@@ -87,6 +57,34 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
         "Now answer the following question:\n"
     ),
     },
+    validator_cfg={
+        "unknown_response_templates": [
+            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
+        ],
+        "unknown_threshold": 0.85,
+        "embedding_model": "text-embedding-ada-002",
+        "use_reranking": True,
+        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
+        "check_question_prompt": """You are an chatbot answering questions on python libraries.
+
+Your job is to determine wether or not a question is valid, and should be answered.
+A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
+
+For example:
+
+Q: How can I install the library?
+true
+
+Q: What is the meaning of life?
+false
+
+A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+            "stream": False,
+            "temperature": 0,
+        },
+    },
 )
 
 
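
The validator_cfg block is moved below retriever_cfg, not modified: BusterConfig takes both dicts as keyword arguments, so their order inside the call has no behavioral effect. For orientation, a hypothetical sketch of how setup_buster (defined elsewhere in cfg.py, not part of this diff) presumably consumes these dicts, assuming Buster's usual pattern of unpacking each sub-config into its component; the Buster import and the omitted answerer wiring are assumptions, not shown in this commit:

from buster.busterbot import Buster  # assumed import; not visible in this diff

def setup_buster(buster_cfg: BusterConfig) -> Buster:
    # Each sub-config dict is unpacked into its matching component.
    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
    # Completer/answerer wiring omitted here; it would follow the same unpacking pattern.
    return Buster(retriever=retriever, validator=validator)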
rtd_scraper/scrape_rtd.py
CHANGED

@@ -1,16 +1,11 @@
 import logging
 import os
 
-from buster.docparser import get_all_documents
-from buster.documents_manager import DeepLakeDocumentsManager
-from buster.parser import SphinxParser
 from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
 
 from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider, sanitize_url
 
-# from tutorial.spiders.docs_spider import DocsSpider
-
 # When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here...
 for name in logging.root.manager.loggerDict:
     logger = logging.getLogger(name)

@@ -31,51 +26,3 @@ def run_spider(homepage_url, save_directory, target_version=None):
 
     # To stop the crawling process gracefully
     process.stop()
-
-
-def scrape_rtd(homepage_url, save_directory, target_version=None):
-
-    # adds https:// and trailing backslash
-    homepage_url = sanitize_url(homepage_url)
-
-    # Crawl the website using scrapy
-    run_spider(
-        homepage_url, save_directory=save_directory, target_version=target_version
-    )
-
-    # # Convert the .html pages into chunks using Buster's SphinxParser
-    root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
-
-    # root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/
-    df = get_all_documents(
-        root_dir=root_dir,
-        base_url=homepage_url,
-        parser_cls=SphinxParser,
-        min_section_length=100,
-        max_section_length=1000,
-    )
-
-    # Add the source column
-    df["source"] = "readthedocs"
-
-    # Initialize the DeepLake vector store
-    dm = DeepLakeDocumentsManager(
-        vector_store_path=os.path.join(save_directory, "deeplake_store"),
-        overwrite=True,
-        required_columns=["url", "content", "source", "title"],
-    )
-
-    # Add all embeddings to the vector store
-    dm.batch_add(
-        df=df,
-        batch_size=3000,
-        min_time_interval=60,
-        num_workers=32,
-    )
-
-
-if __name__ == "__main__":
-    homepage_url = "https://orion.readthedocs.io/"
-    scrape_rtd(
-        homepage_url=homepage_url, target_version="v0.2.7", save_directory="outputs/"
-    )
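
The deleted scrape_rtd pipeline lines up with the new "from embed_docs import embed_documents" in app.py, so the crawl, parse, and embed logic presumably moved into embed_docs.py rather than being dropped. A plausible reconstruction of that function, assembled from the lines removed above; the function name and module come from app.py's new import, everything else is the removed code:

# embed_docs.py (hypothetical reconstruction from the code removed above)
import os

from buster.docparser import get_all_documents
from buster.documents_manager import DeepLakeDocumentsManager
from buster.parser import SphinxParser

from rtd_scraper.scrape_rtd import run_spider
from rtd_scraper.tutorial.spiders.docs_spider import sanitize_url


def embed_documents(homepage_url, save_directory, target_version=None):
    # adds https:// and trailing backslash
    homepage_url = sanitize_url(homepage_url)

    # Crawl the website using scrapy
    run_spider(homepage_url, save_directory=save_directory, target_version=target_version)

    # root_dir is the folder containing the scraped content, e.g. outputs/buster.readthedocs.io/
    root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])

    # Convert the .html pages into chunks using Buster's SphinxParser
    df = get_all_documents(
        root_dir=root_dir,
        base_url=homepage_url,
        parser_cls=SphinxParser,
        min_section_length=100,
        max_section_length=1000,
    )

    # Tag each chunk with its source before storing
    df["source"] = "readthedocs"

    # Initialize the DeepLake vector store and add all embeddings to it
    dm = DeepLakeDocumentsManager(
        vector_store_path=os.path.join(save_directory, "deeplake_store"),
        overwrite=True,
        required_columns=["url", "content", "source", "title"],
    )
    dm.batch_add(df=df, batch_size=3000, min_time_interval=60, num_workers=32)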