# Utilities to build a RAG system to query information from the
# gwIAS search pipeline using Langchain
# Thanks to Pablo Villanueva Domingo for sharing his CAMELS template
# https://huggingface.co/spaces/PabloVD/CAMELSDocBot
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Load documentation from urls
def load_docs():
    """Load documentation pages from the URLs listed in urls.txt.

    Reads one URL per line (blank lines and surrounding whitespace are
    ignored), fetches each page with WebBaseLoader, and tags every document
    with a 'name' metadata entry for later reference.

    Returns:
        list: The loaded LangChain documents.
    """
    # Context manager guarantees the file is closed even if reading fails;
    # strip() drops trailing newlines and skips blank lines that would
    # otherwise become empty-string URLs.
    with open("urls.txt") as urlsfile:
        urls = [line.strip() for line in urlsfile if line.strip()]

    # Load the contents of every page
    loader = WebBaseLoader(urls)
    docs = loader.load()

    # Use the source URL as a human-readable document name when available
    for i, doc in enumerate(docs):
        doc.metadata['name'] = doc.metadata.get('source', f"Document {i+1}")

    print(f"Loaded {len(docs)} documents:")
    for doc in docs:
        print(f" - {doc.metadata.get('name')}")
    return docs
def extract_reference(url):
    """Derive a short reference label from a GitHub file/directory URL."""
    # File URLs (blob/main/...) and directory URLs (tree/main/...) are
    # labelled by the path after the marker; blob takes precedence.
    for marker in ("blob/main", "tree/main"):
        if marker in url:
            tail = url.split(marker + "/")[-1]
            # A directory URL ending exactly at tree/main/ means the repo root
            if marker == "tree/main" and not tail:
                return "root"
            return tail
    # Not a recognised GitHub URL: fall back to the URL itself
    return url
# Join content pages for processing
def format_docs(docs):
    """Join document contents into one string, tagging each with a reference."""
    def render(doc):
        # Tag each chunk with a bracketed reference derived from its source URL
        source = doc.metadata.get('source', 'Unknown source')
        return f"{doc.page_content}\n\nReference: [{extract_reference(source)}]"

    return "\n\n---\n\n".join(render(doc) for doc in docs)
# Create a RAG chain
def RAG(llm, docs, embeddings):
    """Build a retrieval-augmented generation chain over the given documents.

    Args:
        llm: LangChain-compatible chat/LLM runnable used for generation.
        docs: Documents to index (e.g. the output of load_docs()).
        embeddings: Embedding model used to build the Chroma vector store.

    Returns:
        A runnable chain mapping a question string to an answer string.
    """
    # Chunk the documents with overlap so context is preserved across splits
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    # Index the chunks in a Chroma vector store and expose it as a retriever
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
    retriever = vectorstore.as_retriever()

    # Start from the community RAG prompt and swap in custom instructions
    # ("retrieved" typo fixed in the instruction text below)
    prompt = hub.pull("rlm/rag-prompt")
    template = prompt.messages[0].prompt.template
    marker = "\nQuestion: {question}"
    template_parts = template.split(marker)
    custom_instructions = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer the question. "
        "If you don't know the answer, just say that you don't know. "
        "Use six sentences maximum and keep the answer concise. "
        "Write the names of the relevant functions from the retrieved code. "
        "Include the reference IDs in square brackets at the end of your answer."
    )
    # Guard against upstream changes to the hub prompt: only splice when the
    # question marker is present, otherwise keep the prompt as pulled (the
    # original code raised IndexError here).
    # NOTE(review): splicing template_parts[1] drops the marker itself, so the
    # resulting template no longer contains {question} — confirm this is intended.
    if len(template_parts) > 1:
        prompt.messages[0].prompt.template = custom_instructions + template_parts[1]

    # Compose retrieval -> prompt -> model -> plain-string output
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain