Spaces:

sabazo
/

insurance_advisor_wb

Sleeping

App Files Files Community

Xalt8 committed on Jul 12, 2024

Commit

466b7d1

1 Parent(s): 15c0646

reranking with chroma fixed

Browse files

Files changed (2) hide show

rag_app/loading_data/load_chroma_db_cross_platform.py +55 -0
rag_app/reranking.py +66 -14

rag_app/loading_data/load_chroma_db_cross_platform.py ADDED Viewed

	@@ -0,0 +1,55 @@

+from pathlib import Path
+import boto3
+from botocore.client import Config
+from botocore import UNSIGNED
+from dotenv import load_dotenv
+import os
+import sys
+import zipfile
# Read a local .env file (if present) before the lookup; load_dotenv was
# imported but never called, so a .env-defined S3_LOCATION was ignored.
load_dotenv()
# Name of the S3 bucket holding the zipped vector stores (None when unset).
S3_LOCATION = os.getenv("S3_LOCATION")
def download_chroma_from_s3(s3_location:str,
                            chroma_vs_name:str,
                            vectorstore_folder:str,
                            vs_save_name:str) -> None:
    """
    Downloads a zipped Chroma DB from an S3 storage and extracts it to a local folder.

    The archive is saved temporarily as ``<vectorstore_folder>/<vs_save_name>.zip``,
    extracted into ``vectorstore_folder`` and then removed. Errors raised while
    downloading or extracting are reported on stderr and not re-raised; the
    temporary zip file is cleaned up in either case.
        Args
            s3_location (str): The name of S3 bucket
            chroma_vs_name (str): The name (key) of the zip file to download from S3
            vectorstore_folder (str): The filepath to vectorstore folder in project dir
            vs_save_name (str): The name of the vector store
        Returns: None
    """
    vs_destination = Path(vectorstore_folder) / vs_save_name
    # Append ".zip" rather than using with_suffix(), which would replace the
    # tail of a dotted store name (e.g. "store-1.5" -> "store-1.zip").
    vs_save_path = vs_destination.with_name(vs_destination.name + '.zip')
    try:
        # Initialize an S3 client with unsigned configuration for public access
        s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
        # download_file documents a str filename, so convert the Path explicitly
        s3.download_file(s3_location, chroma_vs_name, str(vs_save_path))
        # Extract the zip file
        with zipfile.ZipFile(file=str(vs_save_path), mode='r') as zip_ref:
            zip_ref.extractall(path=vectorstore_folder)
    except Exception as e:
        print(f"Error during downloading or extracting from S3: {e}", file=sys.stderr)
    finally:
        # Delete the zip file; it may not exist if the download itself failed,
        # and that must not mask the error already reported above.
        vs_save_path.unlink(missing_ok=True)
if __name__ == "__main__":
    # Script entry point: fetch the zipped Chroma store from S3 and unpack it
    # into the project's `vectorstore` folder.
    chroma_vs_name = "vectorstores/chroma-zurich-mpnet-1500.zip"
    # The project dir is taken as the parent of the current working directory,
    # so the script must be started from a direct subfolder of the project.
    project_dir = Path.cwd().parent
    vs_destination = str(project_dir / 'vectorstore')
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if not S3_LOCATION:
        sys.exit("S3_LOCATION environment variable is not set")
    if not Path(vs_destination).is_dir():
        sys.exit("Cannot find vectorstore folder")
    download_chroma_from_s3(s3_location=S3_LOCATION,
                            chroma_vs_name=chroma_vs_name,
                            vectorstore_folder=vs_destination,
                            vs_save_name='chroma-zurich-mpnet-1500')

rag_app/reranking.py CHANGED Viewed

@@ -5,11 +5,13 @@ from dotenv import load_dotenv
 import os
 from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
 import requests
 load_dotenv()
-def get_reranked_docs(query:str,
                       path_to_db:str,
                       embedding_model:str,
                       hf_api_key:str,
@@ -59,22 +61,72 @@ def get_reranked_docs(query:str,
         ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
         top_k_results = ranked_results[:num_docs]
         return [doc for doc, _, _ in top_k_results]
-if __name__ == "__main__":
-    HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
-    path_to_vector_db = Path("..")/'vectorstore/faiss-insurance-agent-500'
-    query = "Ich möchte wissen, ob ich meine geriatrische Haustier-Eidechse versichern kann"
-    top_5_docs = get_reranked_docs(query=query,
-                                   path_to_db=path_to_vector_db,
-                                   embedding_model=EMBEDDING_MODEL,
-                                   hf_api_key=HUGGINGFACEHUB_API_TOKEN,
-                                   num_docs=5)
-    for i, doc in enumerate(top_5_docs):
-        print(f"{i}: {doc}\n")

 import os
 from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
 import requests
+from langchain_community.vectorstores import Chroma
 load_dotenv()
+def get_reranked_docs_faiss(query:str,
                       path_to_db:str,
                       embedding_model:str,
                       hf_api_key:str,
         ranked_results = sorted(zip(docs, passages, relevance_scores), key=lambda x: x[2], reverse=True)
         top_k_results = ranked_results[:num_docs]
         return [doc for doc, _, _ in top_k_results]
def get_reranked_docs_chroma(query:str,
                      path_to_db:str,
                      embedding_model:str,
                      hf_api_key:str,
                      reranking_hf_url:str = "https://api-inference.huggingface.co/models/sentence-transformers/all-mpnet-base-v2",
                      num_docs:int=5,
                      timeout:float=60.0) -> list:
    """ Re-ranks the similarity search results and returns top-k highest ranked docs

        Runs a similarity search against the Chroma store, sends the query and the
        retrieved passages to the Hugging Face endpoint at `reranking_hf_url`, and
        orders the documents by the scores returned.
        Args:
            query (str): The search query
            path_to_db (str): Path to the vectorstore database
            embedding_model (str): Embedding model used in the vector store
            hf_api_key (str): Hugging Face API key, used for the embeddings and the re-ranking request
            reranking_hf_url (str): URL of the sentence-similarity endpoint used for re-ranking
            num_docs (int): Number of documents to return (0 to 10)
            timeout (float): Seconds to wait for the re-ranking request
        Returns: A list of documents with the highest rank; an empty list when the
            search finds nothing or the re-ranking response is unusable
        Raises:
            ValueError: If num_docs is outside the range of similarity search results
    """
    # Number of candidates fetched from the vector store before re-ranking
    num_candidates = 10
    # Explicit check instead of `assert`, which is stripped under `python -O`
    if not 0 <= num_docs <= num_candidates:
        raise ValueError(
            f"num_docs should be between 0 and {num_candidates} (the number of similarity search results)"
        )
    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=hf_api_key,
                                                   model_name=embedding_model)
    # Load the vectorstore database
    db = Chroma(persist_directory=path_to_db, embedding_function=embeddings)
    # Get the candidate documents based on similarity search
    sim_docs = db.similarity_search(query=query, k=num_candidates)
    if not sim_docs:
        # Nothing to re-rank, so skip the API call
        return []
    # Only the page content of each document is sent for re-ranking
    passages = [doc.page_content for doc in sim_docs]
    # Prepare the payload
    payload = {"inputs":
               {"source_sentence": query,
                "sentences": passages}}
    headers = {"Authorization": f"Bearer {hf_api_key}"}
    # A timeout keeps an unresponsive endpoint from blocking the caller forever
    response = requests.post(url=reranking_hf_url, headers=headers, json=payload, timeout=timeout)
    if response.status_code != 200:
        print(f'Something went wrong with the response (HTTP {response.status_code})')
        # Honour the declared return type instead of returning None
        return []
    similarity_scores = response.json()
    # One score per passage is expected; zip() would silently truncate on a
    # mismatch or iterate the keys of an error dict.
    if not isinstance(similarity_scores, list) or len(similarity_scores) != len(sim_docs):
        print('Something went wrong with the response')
        return []
    ranked_results = sorted(zip(sim_docs, similarity_scores), key=lambda pair: pair[1], reverse=True)
    return [doc for doc, _ in ranked_results[:num_docs]]
if __name__ == "__main__":
    # Manual smoke test: re-rank the Chroma search hits for one sample query
    # and print the resulting documents.
    HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
    EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

    # The vector store is looked up relative to the parent of the working directory.
    project_dir = Path.cwd().parent
    path_to_vector_db = str(project_dir / 'vectorstore/chroma-zurich-mpnet-1500')

    query = "I'm looking for student insurance"
    re_ranked_docs = get_reranked_docs_chroma(
        query,
        path_to_vector_db,
        EMBEDDING_MODEL,
        HUGGINGFACEHUB_API_TOKEN,
    )
    print(f"{re_ranked_docs=}")