# Utilities to build a RAG system to query information from the
# gwIAS search pipeline using Langchain
# Thanks to Pablo Villanueva Domingo for sharing his CAMELS template
# https://huggingface.co/spaces/PabloVD/CAMELSDocBot

from langchain import hub
from langchain_chroma import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader


# Load documentation from URLs listed in urls.txt
def load_docs():
    # Read the URLs, one per line
    with open("urls.txt") as urlsfile:
        urls = [url.strip() for url in urlsfile.readlines()]

    # Load the contents of each page
    loader = WebBaseLoader(urls)
    docs = loader.load()

    # Add source URLs as document names for reference
    for i, doc in enumerate(docs):
        if 'source' in doc.metadata:
            doc.metadata['name'] = doc.metadata['source']
        else:
            doc.metadata['name'] = f"Document {i+1}"

    print(f"Loaded {len(docs)} documents:")
    for doc in docs:
        print(f"  - {doc.metadata.get('name')}")

    return docs


def extract_reference(url):
    """Extract a reference keyword (the in-repo path) from a GitHub URL."""
    if "blob/main/" in url:
        return url.split("blob/main/")[-1]
    elif "tree/main/" in url:
        return url.split("tree/main/")[-1] or "root"
    return url


# Join content pages for processing, appending each page's reference
def format_docs(docs):
    formatted_docs = []
    for doc in docs:
        source = doc.metadata.get('source', 'Unknown source')
        reference = f"[{extract_reference(source)}]"
        content = doc.page_content
        formatted_docs.append(f"{content}\n\nReference: {reference}")
    return "\n\n---\n\n".join(formatted_docs)


# Create a RAG chain
def RAG(llm, docs, embeddings):
    # Split text into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    # Create vector store
    vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

    # Retrieve and generate using the relevant snippets of the documents
    retriever = vectorstore.as_retriever()

    # Prompt basis example for RAG systems
    prompt = hub.pull("rlm/rag-prompt")

    # Replace the stock preamble with custom instructions, keeping the
    # Question/Context/Answer suffix of the original template intact
    template = prompt.messages[0].prompt.template
    template_parts = template.split("\nQuestion: {question}")
    combined_template = (
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer the question. "
        "If you don't know the answer, just say that you don't know. "
        "Use six sentences maximum and keep the answer concise. "
        "Write the names of the relevant functions from the retrieved code. "
        "Include the reference IDs in square brackets at the end of your answer."
        # Re-attach the question placeholder; dropping it would mean the
        # user's question never reaches the LLM
        + "\nQuestion: {question}"
        + template_parts[1]
    )
    prompt.messages[0].prompt.template = combined_template

    # Create the chain: retrieve context, format it, fill the prompt,
    # call the LLM, and parse the output to a string
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

    return rag_chain
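

# Example usage: a minimal sketch of wiring load_docs() and RAG() together.
# The model classes below (ChatOpenAI and OpenAIEmbeddings from
# langchain_openai) and the model name are assumptions for illustration;
# swap in whichever LLM and embedding integrations your deployment uses.
if __name__ == "__main__":
    from langchain_openai import ChatOpenAI, OpenAIEmbeddings

    docs = load_docs()
    llm = ChatOpenAI(model="gpt-4o-mini")  # hypothetical model choice
    embeddings = OpenAIEmbeddings()
    rag_chain = RAG(llm, docs, embeddings)

    # The chain accepts a plain question string: RunnablePassthrough
    # forwards it as {question} while the retriever fills {context}
    answer = rag_chain.invoke("What does the gwIAS pipeline compute?")
    print(answer)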