ppsingh committed on
Commit
1c8984d
·
verified ·
1 Parent(s): 1170eaf

Update appStore/embed.py

Browse files
Files changed (1) hide show
  1. appStore/embed.py +39 -16
appStore/embed.py CHANGED
@@ -1,36 +1,59 @@
1
- from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
2
- from langchain_community.vectorstores import Qdrant
3
- from qdrant_client import QdrantClient
4
  from langchain_qdrant import FastEmbedSparse, RetrievalMode
5
  from torch import cuda
6
  import streamlit as st
 
 
 
 
7
  # get the device to be used, either gpu or cpu
8
  device = 'cuda' if cuda.is_available() else 'cpu'
9
 
10
 
11
- def hybrid_embed_chunks(chunks):
12
  """
13
  takes the chunks and does the hybrid embedding for the list of chunks
14
  """
 
 
15
  embeddings = HuggingFaceEmbeddings(
16
  model_kwargs = {'device': device},
17
  encode_kwargs = {'normalize_embeddings': True},
18
  model_name='BAAI/bge-m3'
19
  )
 
 
20
  sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")
21
- # placeholder for collection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  print("starting embedding")
23
- #qdrant_collections = {}
24
- Qdrant.from_documents(
25
- chunks,
26
- embeddings,
27
- sparse_embeddings = sparse_embeddings,
28
- path="/data/local_qdrant",
29
- collection_name='giz_worldwide',
30
- retrieval_mode=RetrievalMode.HYBRID,
31
- )
32
-
33
- print(qdrant_collections)
34
  print("vector embeddings done")
35
 
36
  @st.cache_resource
 
1
+ from langchain_qdrant import QdrantVectorStore
 
 
2
  from langchain_qdrant import FastEmbedSparse, RetrievalMode
3
  from torch import cuda
4
  import streamlit as st
5
+ from langchain_huggingface import HuggingFaceEmbeddings
6
+ from appStore.prep_utils import get_client
7
+ from qdrant_client.http import models
8
+
9
  # get the device to be used, either gpu or cpu
10
  device = 'cuda' if cuda.is_available() else 'cpu'
11
 
12
 
13
+ def hybrid_embed_chunks(docs, collection_name):
14
  """
15
  takes the docs, computes dense and sparse (hybrid) embeddings for them and adds them to the Qdrant collection named collection_name
16
  """
17
+
18
+ # Dense Embeddings function
19
  embeddings = HuggingFaceEmbeddings(
20
  model_kwargs = {'device': device},
21
  encode_kwargs = {'normalize_embeddings': True},
22
  model_name='BAAI/bge-m3'
23
  )
24
+
25
+ # Sparse Embedding Function
26
  sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")
27
+
28
+ # get existing client
29
+ client = get_client()
30
+
31
+ # create collection
32
+ client.create_collection(
33
+ collection_name=collection_name,
34
+ vectors_config={
35
+ "text-dense": models.VectorParams(size=1024, distance=models.Distance.COSINE, on_disk = True)
36
+ },
37
+ sparse_vectors_config={
38
+ "text-sparse": models.SparseVectorParams(index=models.SparseIndexParams(
39
+ on_disk=True,
40
+ )
41
+ )
42
+ },)
43
+
44
+ # create Vector store
45
+ vector_store = QdrantVectorStore(
46
+ client=client,
47
+ collection_name=collection_name,
48
+ embedding=embeddings,
49
+ vector_name="text-dense",
50
+ sparse_embedding = sparse_embeddings,
51
+ sparse_vector_name="text-sparse",
52
+ retrieval_mode=RetrievalMode.HYBRID,
53
+ )
54
+
55
  print("starting embedding")
56
+ vector_store.add_documents(docs)
 
 
 
 
 
 
 
 
 
 
57
  print("vector embeddings done")
58
 
59
  @st.cache_resource