ppsingh commited on
Commit
c567921
·
1 Parent(s): cb359de

add search

Browse files
Files changed (2) hide show
  1. app.py +12 -7
  2. appStore/search.py +47 -2
app.py CHANGED
@@ -3,6 +3,7 @@ import pandas as pd
3
  from appStore.prep_data import process_giz_worldwide
4
  from appStore.prep_utils import create_documents, get_client
5
  from appStore.embed import hybrid_embed_chunks
 
6
  from torch import cuda
7
  # get the device to be used eithe gpu or cpu
8
  device = 'cuda' if cuda.is_available() else 'cpu'
@@ -19,12 +20,14 @@ var=st.text_input("enter keyword")
19
  ##### Convert to langchain documents
20
  #temp_doc = create_documents(chunks,'chunks')
21
  ##### Embed and store docs, check if collection exist then you need to update the collection
22
- #collection_name = "giz_worldwide"
23
  #hybrid_embed_chunks(docs= temp_doc, collection_name = collection_name)
24
 
25
  ################### Hybrid Search ######################################################
26
  client = get_client()
27
  print(client.get_collections())
 
 
28
 
29
 
30
  button=st.button("search")
@@ -32,10 +35,12 @@ button=st.button("search")
32
  #print(found_docs)
33
  # results= get_context(vectorstore, f"find the relvant paragraphs for: {var}")
34
  if button:
35
- st.write(f"Found {len(results)} results for query:{var}")
 
 
36
 
37
- for i in results:
38
- st.subheader(str(i.metadata['id'])+":"+str(i.metadata['title_main']))
39
- st.caption(f"Status:{str(i.metadata['status'])}, Country:{str(i.metadata['country_name'])}")
40
- st.write(i.page_content)
41
- st.divider()
 
3
  from appStore.prep_data import process_giz_worldwide
4
  from appStore.prep_utils import create_documents, get_client
5
  from appStore.embed import hybrid_embed_chunks
6
+ from appStore.search import hybrid_search
7
  from torch import cuda
8
  # get the device to be used: either gpu or cpu
9
  device = 'cuda' if cuda.is_available() else 'cpu'
 
20
  ##### Convert to langchain documents
21
  #temp_doc = create_documents(chunks,'chunks')
22
##### Embed and store docs; check if the collection exists, then you need to update the collection
collection_name = "giz_worldwide"
#hybrid_embed_chunks(docs= temp_doc, collection_name = collection_name)

################### Hybrid Search ######################################################
client = get_client()
print(client.get_collections())

button = st.button("search")
#print(found_docs)
# results= get_context(vectorstore, f"find the relevant paragraphs for: {var}")
if button:
    # Run the search only after the user clicks the button, so we don't hit
    # the vector store (and embed an empty query string) on every Streamlit rerun.
    results = hybrid_search(client, var, collection_name)
    st.write(f"Showing Top 10 results for query:{var}")
    # results[0] holds the dense (semantic) hits, results[1] the sparse (BM25) hits
    # — the second label was a copy-paste of the first ("Semantic") and is fixed here.
    st.write(f"Semantic: {len(results[0])}")
    st.write(f"Lexical: {len(results[1])}")

    # for i in results:
    #     st.subheader(str(i.metadata['id'])+":"+str(i.metadata['title_main']))
    #     st.caption(f"Status:{str(i.metadata['status'])}, Country:{str(i.metadata['country_name'])}")
    #     st.write(i.page_content)
    #     st.divider()
appStore/search.py CHANGED
@@ -1,5 +1,50 @@
1
  from appStore.prep_utils import get_client
 
 
 
 
2
 
3
- def hybrid_search(client, query):
4
- print("wip")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
 
1
from appStore.prep_utils import get_client
from langchain_qdrant import FastEmbedSparse, RetrievalMode
from langchain_huggingface import HuggingFaceEmbeddings
# `models` was referenced below but never imported, which made every call
# raise NameError — import it from qdrant_client.
from qdrant_client import models
from torch import cuda

import streamlit as st

# get the device to be used: either gpu or cpu
device = 'cuda' if cuda.is_available() else 'cpu'


def hybrid_search(client, query, collection_name):
    """Run a hybrid (dense + sparse) search against a Qdrant collection.

    Embeds *query* twice — densely with the BAAI/bge-m3 HuggingFace model and
    sparsely with Qdrant's BM25 FastEmbed model — and issues one batched
    request containing a semantic search on the "text-dense" named vector and
    a lexical search on the "text-sparse" named sparse vector, each limited to
    the top 10 hits.

    Parameters
    ----------
    client : qdrant_client.QdrantClient
        Connected Qdrant client (see ``get_client``).
    query : str
        Free-text search string.
    collection_name : str
        Collection holding the "text-dense" and "text-sparse" named vectors.

    Returns
    -------
    list
        Two result lists as returned by ``client.search_batch``:
        ``results[0]`` dense/semantic hits, ``results[1]`` sparse/BM25 hits.
    """
    # NOTE(review): both embedding models are re-instantiated (and potentially
    # re-downloaded) on every call; consider caching them, e.g. with
    # st.cache_resource, if query latency matters.
    embeddings = HuggingFaceEmbeddings(
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
        model_name='BAAI/bge-m3',
    )
    sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")

    # embed query
    q_dense = embeddings.embed_query(query)
    q_sparse = sparse_embeddings.embed_query(query)

    results = client.search_batch(
        collection_name=collection_name,
        requests=[
            # dense (semantic) request against the "text-dense" named vector
            models.SearchRequest(
                vector=models.NamedVector(
                    name="text-dense",
                    vector=q_dense,
                ),
                limit=10,
            ),
            # sparse (BM25) request against the "text-sparse" named sparse vector
            models.SearchRequest(
                vector=models.NamedSparseVector(
                    name="text-sparse",
                    vector=models.SparseVector(
                        indices=q_sparse.indices,
                        values=q_sparse.values,
                    ),
                ),
                limit=10,
            ),
        ],
    )

    print(results)
    return results
50