Commit: remove unused code, add exceptions if variables not set

Files changed:
- app.py (+34 -14)
- cfg.py (+28 -30)
- rtd_scraper/scrape_rtd.py (+0 -53)
app.py
CHANGED

@@ -6,34 +6,54 @@ import pandas as pd
 from buster.completers import Completion
 
 # from embed_docs import embed_rtd_website
-from rtd_scraper.scrape_rtd import scrape_rtd
+# from rtd_scraper.scrape_rtd import scrape_rtd
+from embed_docs import embed_documents
 import cfg
 from cfg import setup_buster
 
+# Typehint for chatbot history
+ChatHistory = list[list[Optional[str], Optional[str]]]
+
+
+# Because this is a one-click deploy app, we will be relying on env. variables being set
+openai_api_key = os.getenv("OPENAI_API_KEY")  # Mandatory for app to work
+readthedocs_url = os.getenv("READTHEDOCS_URL")  # Mandatory for app to work as intended
+readthedocs_version = os.getenv("READTHEDOCS_VERSION")
 
-
-if os.getenv("OPENAI_API_KEY") is None:
+if openai_api_key is None:
     print(
-        "Warning: No
+        "Warning: No OPENAI_API_KEY detected. Set it with 'export OPENAI_API_KEY=sk-...'."
     )
 
+if readthedocs_url is None:
+    raise ValueError(
+        "No READTHEDOCS_URL detected. Set it with e.g. 'export READTHEDOCS_URL=https://orion.readthedocs.io/'"
+    )
 
-
-
+if readthedocs_version is None:
+    print(
+        """
+        Warning: No READTHEDOCS_VERSION detected. If multiple versions of the docs exist, they will all be scraped.
+        Set it with e.g. 'export READTHEDOCS_VERSION=en/stable'
+        """
+    )
 
-# scrape and embed content from readthedocs website
-# comment out if already embedded locally to avoid extra costs
-scrape_rtd(
-    homepage_url=homepage_url, save_directory="outputs/", target_version=target_version
-)
 
+# Override to put it anywhere
+save_directory = "outputs/"
 
-#
-
+# scrape and embed content from readthedocs website
+embed_documents(
+    homepage_url=readthedocs_url,
+    save_directory=save_directory,
+    target_version=readthedocs_version,
+)
 
+# Setup RAG agent
 buster = setup_buster(cfg.buster_cfg)
 
 
+# Setup Gradio app
 def add_user_question(
     user_question: str, chat_history: Optional[ChatHistory] = None
 ) -> ChatHistory:

@@ -157,5 +177,5 @@ with demo:
 )
 
 
-demo.queue(concurrency_count=
+demo.queue(concurrency_count=8)
 demo.launch(share=False)
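
Since app.py now reads its configuration from environment variables at import time, the variables must be set before the module loads. A minimal sketch of a local run, using only the example values from the commit's own warning messages (the API key is a placeholder):

import os

# Placeholder values taken from the error messages above; use real ones before launching.
os.environ["OPENAI_API_KEY"] = "sk-..."  # mandatory for the app to work
os.environ["READTHEDOCS_URL"] = "https://orion.readthedocs.io/"  # mandatory
os.environ["READTHEDOCS_VERSION"] = "en/stable"  # optional; if unset, all doc versions get scraped

import app  # noqa: E402  # importing app.py triggers the scrape/embed step and builds the demo

In practice the same thing is done from a shell with export, as the messages suggest; assigning to os.environ only works if it happens before app.py is imported.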
cfg.py
CHANGED

@@ -6,37 +6,7 @@ from buster.retriever import DeepLakeRetriever, Retriever
 from buster.tokenizers import GPTTokenizer
 from buster.validators import QuestionAnswerValidator, Validator
 
-from rtd_scraper.scrape_rtd import scrape_rtd
-
 buster_cfg = BusterConfig(
-    validator_cfg={
-        "unknown_response_templates": [
-            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
-        ],
-        "unknown_threshold": 0.85,
-        "embedding_model": "text-embedding-ada-002",
-        "use_reranking": True,
-        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
-        "check_question_prompt": """You are an chatbot answering questions on python libraries.
-
-Your job is to determine wether or not a question is valid, and should be answered.
-A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
-
-For example:
-
-Q: How can I install the library?
-true
-
-Q: What is the meaning of life?
-false
-
-A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
-        "completion_kwargs": {
-            "model": "gpt-3.5-turbo",
-            "stream": False,
-            "temperature": 0,
-        },
-    },
     retriever_cfg={
         "path": "outputs/deeplake_store",
         "top_k": 3,

@@ -87,6 +57,34 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
         "Now answer the following question:\n"
     ),
     },
+    validator_cfg={
+        "unknown_response_templates": [
+            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
+        ],
+        "unknown_threshold": 0.85,
+        "embedding_model": "text-embedding-ada-002",
+        "use_reranking": True,
+        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
+        "check_question_prompt": """You are an chatbot answering questions on python libraries.
+
+Your job is to determine wether or not a question is valid, and should be answered.
+A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
+
+For example:
+
+Q: How can I install the library?
+true
+
+Q: What is the meaning of life?
+false
+
+A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+            "stream": False,
+            "temperature": 0,
+        },
+    },
 )
 
 
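
The validator_cfg block is moved below retriever_cfg, not modified: BusterConfig takes both dicts as keyword arguments, so their order inside the call has no behavioral effect. For orientation, a hypothetical sketch of how setup_buster (defined elsewhere in cfg.py, not part of this diff) presumably consumes these dicts, assuming Buster's usual pattern of unpacking each sub-config into its component; the Buster import and the omitted answerer wiring are assumptions, not shown in this commit:

from buster.busterbot import Buster  # assumed import; not visible in this diff

def setup_buster(buster_cfg: BusterConfig) -> Buster:
    # Each sub-config dict is unpacked into its matching component.
    retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
    validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
    # Completer/answerer wiring omitted here; it would follow the same unpacking pattern.
    return Buster(retriever=retriever, validator=validator)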
rtd_scraper/scrape_rtd.py
CHANGED

@@ -1,16 +1,11 @@
 import logging
 import os
 
-from buster.docparser import get_all_documents
-from buster.documents_manager import DeepLakeDocumentsManager
-from buster.parser import SphinxParser
 from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
 
 from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider, sanitize_url
 
-# from tutorial.spiders.docs_spider import DocsSpider
-
 # When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here...
 for name in logging.root.manager.loggerDict:
     logger = logging.getLogger(name)

@@ -31,51 +26,3 @@ def run_spider(homepage_url, save_directory, target_version=None):
 
     # To stop the crawling process gracefully
     process.stop()
-
-
-def scrape_rtd(homepage_url, save_directory, target_version=None):
-
-    # adds https:// and trailing backslash
-    homepage_url = sanitize_url(homepage_url)
-
-    # Crawl the website using scrapy
-    run_spider(
-        homepage_url, save_directory=save_directory, target_version=target_version
-    )
-
-    # # Convert the .html pages into chunks using Buster's SphinxParser
-    root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
-
-    # root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/
-    df = get_all_documents(
-        root_dir=root_dir,
-        base_url=homepage_url,
-        parser_cls=SphinxParser,
-        min_section_length=100,
-        max_section_length=1000,
-    )
-
-    # Add the source column
-    df["source"] = "readthedocs"
-
-    # Initialize the DeepLake vector store
-    dm = DeepLakeDocumentsManager(
-        vector_store_path=os.path.join(save_directory, "deeplake_store"),
-        overwrite=True,
-        required_columns=["url", "content", "source", "title"],
-    )
-
-    # Add all embeddings to the vector store
-    dm.batch_add(
-        df=df,
-        batch_size=3000,
-        min_time_interval=60,
-        num_workers=32,
-    )
-
-
-if __name__ == "__main__":
-    homepage_url = "https://orion.readthedocs.io/"
-    scrape_rtd(
-        homepage_url=homepage_url, target_version="v0.2.7", save_directory="outputs/"
-    )
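
The deleted scrape_rtd pipeline lines up with the new "from embed_docs import embed_documents" in app.py, so the crawl, parse, and embed logic presumably moved into embed_docs.py rather than being dropped. A plausible reconstruction of that function, assembled from the lines removed above; the function name and module come from app.py's new import, everything else is the removed code:

# embed_docs.py (hypothetical reconstruction from the code removed above)
import os

from buster.docparser import get_all_documents
from buster.documents_manager import DeepLakeDocumentsManager
from buster.parser import SphinxParser

from rtd_scraper.scrape_rtd import run_spider
from rtd_scraper.tutorial.spiders.docs_spider import sanitize_url


def embed_documents(homepage_url, save_directory, target_version=None):
    # adds https:// and trailing backslash
    homepage_url = sanitize_url(homepage_url)

    # Crawl the website using scrapy
    run_spider(homepage_url, save_directory=save_directory, target_version=target_version)

    # root_dir is the folder containing the scraped content, e.g. outputs/buster.readthedocs.io/
    root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])

    # Convert the .html pages into chunks using Buster's SphinxParser
    df = get_all_documents(
        root_dir=root_dir,
        base_url=homepage_url,
        parser_cls=SphinxParser,
        min_section_length=100,
        max_section_length=1000,
    )

    # Tag each chunk with its source before storing
    df["source"] = "readthedocs"

    # Initialize the DeepLake vector store and add all embeddings to it
    dm = DeepLakeDocumentsManager(
        vector_store_path=os.path.join(save_directory, "deeplake_store"),
        overwrite=True,
        required_columns=["url", "content", "source", "title"],
    )
    dm.batch_add(df=df, batch_size=3000, min_time_interval=60, num_workers=32)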