import streamlit as st
import pandas as pd
from torch import cuda
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from appStore.prep_data import process_giz_worldwide
from appStore.prep_utils import create_documents
from appStore.embed import hybrid_embed_chunks, get_local_qdrant

# get the device to be used, either gpu or cpu
device = 'cuda' if cuda.is_available() else 'cpu'

st.set_page_config(page_title="SEARCH IATI", layout='wide')
st.title("SEARCH IATI Database")
var = st.text_input("enter keyword")


def get_context(vectorstore, query):
    """Retrieve the paragraphs most similar to the query from the vector store."""
    # TODO: create metadata filter (see the get_filtered_context sketch at the bottom of this file)
    retriever = vectorstore.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": 0.5, "k": 10},
    )
    # # re-ranking the retrieved results (disabled: model_config is not defined in this
    # # script; see the get_reranked_context sketch at the bottom of this file)
    # model = HuggingFaceCrossEncoder(model_name=model_config.get('ranker', 'MODEL'))
    # compressor = CrossEncoderReranker(model=model, top_n=int(model_config.get('ranker', 'TOP_K')))
    # compression_retriever = ContextualCompressionRetriever(
    #     base_compressor=compressor, base_retriever=retriever
    # )
    context_retrieved = retriever.invoke(query)
    print(f"retrieved paragraphs: {len(context_retrieved)}")
    return context_retrieved


# first we create the chunks for the iati documents
chunks = process_giz_worldwide()
# debug: inspect the first five chunks
for i in range(5):
    print(i, "\n", chunks.loc[i, 'chunks'])
temp_df = chunks[:5]
temp_doc = create_documents(temp_df, 'chunks')
# debug: inspect the first five Document objects
for i in range(5):
    print(i, "\n", temp_doc[i])

# once the chunks are done, we perform hybrid embeddings
hybrid_embed_chunks(docs=temp_doc, collection_name="giz_worldwide")
print("embedding done")
# embed_chunks(chunks)

# load the local Qdrant collection we just populated
vectorstores = get_local_qdrant('giz_worldwide')
vectorstore = vectorstores['giz_worldwide']

button = st.button("search")
# found_docs = vectorstore.similarity_search(var)
# print(found_docs)
if button:
    results = get_context(vectorstore, f"find the relevant paragraphs for: {var}")
    st.write(f"Found {len(results)} results for query: {var}")
    for doc in results:
        st.subheader(str(doc.metadata['id']) + ": " + str(doc.metadata['title_main']))
        st.caption(f"Status: {str(doc.metadata['status'])}, Country: {str(doc.metadata['country_name'])}")
        st.write(doc.page_content)
        st.divider()
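

# --- Optional: cross-encoder re-ranking (sketch) ---
# The re-ranking block inside get_context() is commented out because it depends
# on a `model_config` object that is never defined in this script. The function
# below is a minimal, self-contained sketch of the same idea using the classes
# already imported above; the model name and top_n value are illustrative
# assumptions, not values taken from the project configuration.
def get_reranked_context(vectorstore, query):
    """Retrieve candidate paragraphs, then re-rank them with a cross-encoder."""
    retriever = vectorstore.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"score_threshold": 0.5, "k": 10},
    )
    # assumed model; replace with the ranker configured for this project
    model = HuggingFaceCrossEncoder(model_name="cross-encoder/ms-marco-MiniLM-L-6-v2")
    compressor = CrossEncoderReranker(model=model, top_n=5)
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=compressor, base_retriever=retriever
    )
    return compression_retriever.invoke(query)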
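

# --- Optional: metadata filtering (sketch) ---
# get_context() carries a TODO for a metadata filter. One way to do this with a
# Qdrant-backed vector store is to pass a qdrant_client Filter through
# search_kwargs. The payload key "metadata.country_name" and the helper name
# are assumptions about how document metadata is stored in the collection.
from qdrant_client import models as qdrant_models


def get_filtered_context(vectorstore, query, country_name):
    """Retrieve paragraphs for the query, restricted to a single country."""
    country_filter = qdrant_models.Filter(
        must=[
            qdrant_models.FieldCondition(
                key="metadata.country_name",  # assumed payload key
                match=qdrant_models.MatchValue(value=country_name),
            )
        ]
    )
    retriever = vectorstore.as_retriever(
        search_kwargs={"k": 10, "filter": country_filter}
    )
    return retriever.invoke(query)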