ppsingh commited on
Commit
5170600
·
1 Parent(s): 71dcf5a

hybrid test

Browse files
Files changed (2) hide show
  1. app.py +9 -25
  2. appStore/embed.py +31 -0
app.py CHANGED
@@ -7,8 +7,9 @@ from qdrant_client import QdrantClient
7
  from langchain.retrievers import ContextualCompressionRetriever
8
  from langchain.retrievers.document_compressors import CrossEncoderReranker
9
  from langchain_community.cross_encoders import HuggingFaceCrossEncoder
10
- from langchain_qdrant import FastEmbedSparse, RetrievalMode
11
  from appStore.prep_data import process_giz_worldwide
 
 
12
 
13
  # get the device to be used either gpu or cpu
14
  device = 'cuda' if cuda.is_available() else 'cpu'
@@ -18,28 +19,6 @@ st.set_page_config(page_title="SEARCH IATI",layout='wide')
18
  st.title("SEARCH IATI Database")
19
  var=st.text_input("enter keyword")
20
 
21
- def embed_chunks(chunks):
22
- """
23
- takes the chunks and does the hybrid embedding for the list of chunks
24
- """
25
- embeddings = HuggingFaceEmbeddings(
26
- model_kwargs = {'device': device},
27
- encode_kwargs = {'normalize_embeddings': True},
28
- model_name='BAAI/bge-m3'
29
- )
30
- #sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")
31
- # placeholder for collection
32
- print("starting embedding")
33
- qdrant_collections = {}
34
- qdrant_collections['all'] = Qdrant.from_documents(
35
- chunks,
36
- embeddings,
37
- path="/data/local_qdrant",
38
- collection_name='all',
39
- )
40
-
41
- print(qdrant_collections)
42
- print("vector embeddings done")
43
 
44
  @st.cache_resource
45
  def get_local_qdrant():
@@ -77,7 +56,12 @@ def get_context(vectorstore,query):
77
  # first we create the chunks for iati documents
78
  chunks = process_giz_worldwide()
79
  for i in range(5):
80
- print(chunks.loc[0,'chunks'])
 
 
 
 
 
81
  #print("chunking done")
82
 
83
  # once the chunks are done, we perform hybrid embeddings
@@ -85,7 +69,7 @@ for i in range(5):
85
 
86
  # vectorstores = get_local_qdrant()
87
  # vectorstore = vectorstores['all']
88
- # button=st.button("search")
89
  # results= get_context(vectorstore, f"find the relevant paragraphs for: {var}")
90
  if button:
91
  st.write(f"Found {len(results)} results for query:{var}")
 
7
  from langchain.retrievers import ContextualCompressionRetriever
8
  from langchain.retrievers.document_compressors import CrossEncoderReranker
9
  from langchain_community.cross_encoders import HuggingFaceCrossEncoder
 
10
  from appStore.prep_data import process_giz_worldwide
11
+ from appStore.prep_utils import create_documents
12
+ from appStore.embed import hybrid_embed_chunks
13
 
14
  # get the device to be used either gpu or cpu
15
  device = 'cuda' if cuda.is_available() else 'cpu'
 
19
  st.title("SEARCH IATI Database")
20
  var=st.text_input("enter keyword")
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  @st.cache_resource
24
  def get_local_qdrant():
 
56
  # first we create the chunks for iati documents
57
  chunks = process_giz_worldwide()
58
  for i in range(5):
59
+ print(i,"\n",chunks.loc[i,'chunks'])
60
+ temp_df = chunks[:5]
61
+ temp_doc = create_documents(temp_df,'chunks')
62
+ hybrid_embed_chunks(temp_doc)
63
+
64
+
65
  #print("chunking done")
66
 
67
  # once the chunks are done, we perform hybrid embeddings
 
69
 
70
  # vectorstores = get_local_qdrant()
71
  # vectorstore = vectorstores['all']
72
+ button=st.button("search")
73
  # results= get_context(vectorstore, f"find the relevant paragraphs for: {var}")
74
  if button:
75
  st.write(f"Found {len(results)} results for query:{var}")
appStore/embed.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from torch import cuda  # was missing: `cuda` is used below but never imported
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_qdrant import FastEmbedSparse, QdrantVectorStore, RetrievalMode

# get the device to be used: either gpu or cpu
device = 'cuda' if cuda.is_available() else 'cpu'


def hybrid_embed_chunks(chunks):
    """
    Index *chunks* into a local Qdrant collection using hybrid retrieval
    (dense BGE-M3 vectors + sparse BM25 vectors).

    Parameters
    ----------
    chunks : list of langchain ``Document`` objects to embed and store.

    Returns
    -------
    None. Side effect: creates/updates the ``giz_worldwide`` collection
    persisted under ``/data/local_qdrant``.
    """
    # Dense embeddings; vectors are normalized so cosine similarity is valid.
    embeddings = HuggingFaceEmbeddings(
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': True},
        model_name='BAAI/bge-m3',
    )
    # Sparse (BM25) embeddings provide the lexical half of the hybrid search.
    sparse_embeddings = FastEmbedSparse(model_name="Qdrant/bm25")
    print("starting embedding")
    # Hybrid mode needs langchain_qdrant.QdrantVectorStore — the legacy
    # langchain_community Qdrant class ignores sparse vectors — and the
    # keyword is `sparse_embedding` (singular), not `sparse_embeddings`.
    QdrantVectorStore.from_documents(
        chunks,
        embeddings,
        sparse_embedding=sparse_embeddings,
        path="/data/local_qdrant",
        collection_name='giz_worldwide',
        retrieval_mode=RetrievalMode.HYBRID,
    )
    # Fixed: the original printed `qdrant_collections`, a name that was
    # commented out above, which raised NameError after indexing finished.
    print("vector embeddings done")