Spaces:

GIZ
/

GIZ-Project-Search

Running on CPU Upgrade

App Files Files Community

ppsingh committed on Dec 10, 2024

Commit

1c8984d

verified ·

1 Parent(s): 1170eaf

Update appStore/embed.py

Browse files

Files changed (1) hide show

appStore/embed.py +39 -16

appStore/embed.py CHANGED Viewed

@@ -1,36 +1,59 @@
-from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
-from langchain_community.vectorstores import Qdrant
-from qdrant_client import QdrantClient
 from langchain_qdrant import FastEmbedSparse, RetrievalMode
 from torch import cuda
 import streamlit as st
 # get the device to be used: either GPU or CPU
 device = 'cuda' if cuda.is_available() else 'cpu'
-def hybrid_embed_chunks(chunks):
     """
     takes the list of chunks and computes hybrid (dense + sparse) embeddings for them
     """
     embeddings = HuggingFaceEmbeddings(
         model_kwargs = {'device': device},
         encode_kwargs = {'normalize_embeddings': True},
         model_name='BAAI/bge-m3'
     )
     sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")
-    # placeholder for collection
     print("starting embedding")
-    #qdrant_collections = {}
-    Qdrant.from_documents(
-                chunks,
-                embeddings,
-                sparse_embeddings = sparse_embeddings,
-                path="/data/local_qdrant",
-                collection_name='giz_worldwide',
-                retrieval_mode=RetrievalMode.HYBRID,
-            )
-    print(qdrant_collections)
     print("vector embeddings done")
 @st.cache_resource

+from langchain_qdrant import QdrantVectorStore
 from langchain_qdrant import FastEmbedSparse, RetrievalMode
 from torch import cuda
 import streamlit as st
+from langchain_huggingface import HuggingFaceEmbeddings
+from appStore.prep_utils import get_client
+from qdrant_client.http import models
 # get the device to be used: either GPU or CPU
 device = 'cuda' if cuda.is_available() else 'cpu'
+def hybrid_embed_chunks(docs, collection_name):
     """
     takes the list of chunks and computes hybrid (dense + sparse) embeddings for them
     """
+    # Dense Embeddings function
     embeddings = HuggingFaceEmbeddings(
         model_kwargs = {'device': device},
         encode_kwargs = {'normalize_embeddings': True},
         model_name='BAAI/bge-m3'
     )
+    # Sparse Embedding Function
     sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")
+    # get existing client
+    client = get_client()
+    # create collection
+    client.create_collection(
+    collection_name=collection_name,
+    vectors_config={
+        "text-dense": models.VectorParams(size=1024, distance=models.Distance.COSINE, on_disk = True)
+    },
+    sparse_vectors_config={
+        "text-sparse": models.SparseVectorParams(index=models.SparseIndexParams(
+                on_disk=True,
+            )
+        )
+    },)
+    # create Vector store
+    vector_store = QdrantVectorStore(
+            client=client,
+            collection_name=collection_name,
+            embedding=embeddings,
+            vector_name="text-dense",
+            sparse_embedding = sparse_embeddings,
+            sparse_vector_name="text-sparse",
+            retrieval_mode=RetrievalMode.HYBRID,
+        )
     print("starting embedding")
+    vector_store.add_documents(docs)
     print("vector embeddings done")
 @st.cache_resource