JayWadekar commited on
Commit
699fe5f
·
1 Parent(s): 60b6fa4

Added code to output references

Browse files
Files changed (2) hide show
  1. app.py +0 -3
  2. rag.py +35 -5
app.py CHANGED
@@ -2,9 +2,6 @@
2
  # the gwIAS search pipeline
3
  # using Langchain and deployed with Gradio
4
 
5
- # Thanks to Pablo Villanueva Domingo for sharing his CAMELS template
6
- # https://huggingface.co/spaces/PabloVD/CAMELSDocBot
7
-
8
  from rag import RAG, load_docs
9
  from langchain_community.embeddings import HuggingFaceInstructEmbeddings
10
  from langchain.chat_models import ChatOpenAI
 
2
  # the gwIAS search pipeline
3
  # using Langchain and deployed with Gradio
4
 
 
 
 
5
  from rag import RAG, load_docs
6
  from langchain_community.embeddings import HuggingFaceInstructEmbeddings
7
  from langchain.chat_models import ChatOpenAI
rag.py CHANGED
@@ -23,12 +23,37 @@ def load_docs():
23
  # Load, chunk and index the contents of the blog.
24
  loader = WebBaseLoader(urls)
25
  docs = loader.load()
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  return docs
28
 
 
 
 
 
 
 
 
 
29
  # Join content pages for processing
30
  def format_docs(docs):
31
- return "\n\n".join(doc.page_content for doc in docs)
 
 
 
 
 
 
32
 
33
  # Create a RAG chain
34
  def RAG(llm, docs, embeddings):
@@ -45,6 +70,14 @@ def RAG(llm, docs, embeddings):
45
 
46
  # Prompt basis example for RAG systems
47
  prompt = hub.pull("rlm/rag-prompt")
 
 
 
 
 
 
 
 
48
 
49
  # Create the chain
50
  rag_chain = (
@@ -54,7 +87,4 @@ def RAG(llm, docs, embeddings):
54
  | StrOutputParser()
55
  )
56
 
57
- return rag_chain
58
-
59
-
60
- # Debugging push
 
23
  # Load, chunk and index the contents of the blog.
24
  loader = WebBaseLoader(urls)
25
  docs = loader.load()
26
+
27
+ # Add source URLs as document names for reference
28
+ for i, doc in enumerate(docs):
29
+ if 'source' in doc.metadata:
30
+ doc.metadata['name'] = doc.metadata['source']
31
+ else:
32
+ doc.metadata['name'] = f"Document {i+1}"
33
+
34
+ print(f"Loaded {len(docs)} documents:")
35
+ for doc in docs:
36
+ print(f" - {doc.metadata.get('name')}")
37
 
38
  return docs
39
 
40
def extract_reference(url):
    """Extract a reference keyword from the GitHub URL.

    Returns the repository path that follows ``blob/main/`` or
    ``tree/main/`` ("root" for an empty tree path); any other URL is
    returned unchanged.
    """
    if "blob/main" in url:
        # Path of the file within the repository.
        return url.split("blob/main/")[-1]
    if "tree/main" in url:
        # Directory path; an empty tail means the repository root.
        tail = url.split("tree/main/")[-1]
        return tail if tail else "root"
    return url
48
  # Join content pages for processing
49
def format_docs(docs):
    """Join document contents for the RAG context, appending a bracketed
    reference (derived from the source URL) after each document."""

    def _render(doc):
        # Fall back to a placeholder when no source URL was recorded by the loader.
        src = doc.metadata.get('source', 'Unknown source')
        return f"{doc.page_content}\n\nReference: [{extract_reference(src)}]"

    return "\n\n---\n\n".join(_render(d) for d in docs)
57
 
58
  # Create a RAG chain
59
  def RAG(llm, docs, embeddings):
 
70
 
71
  # Prompt basis example for RAG systems
72
  prompt = hub.pull("rlm/rag-prompt")
73
# Adding custom instructions to the prompt: ask the model to cite the
# bracketed reference IDs that format_docs() appends to each document.
template = prompt.messages[0].prompt.template
template_parts = template.split("\nQuestion: {question}")
if len(template_parts) != 2:
    # Hub template changed shape; leave it untouched rather than corrupt it.
    # (Original code had the condition inverted and then indexed
    # template_parts[1] unconditionally, raising IndexError on failure.)
    print("Error: Template does not contain the expected format.")
else:
    combined_template = (
        template_parts[0]
        + " Include the reference IDs in square brackets when citing specific information."
        # Re-insert the separator consumed by split() so the
        # "Question: {question}" placeholder survives the edit.
        + "\nQuestion: {question}"
        + template_parts[1]
    )
    prompt.messages[0].prompt.template = combined_template
81
 
82
  # Create the chain
83
  rag_chain = (
 
87
  | StrOutputParser()
88
  )
89
 
90
+ return rag_chain