# GIZ-Project-Search: appStore/prep_utils.py
import pandas as pd
import streamlit as st
from langchain_text_splitters import TokenTextSplitter
from langchain.docstore.document import Document
from qdrant_client import QdrantClient

# Location of the local on-disk Qdrant store and the document data folder.
path_to_qdrant = "/data/local_qdrant"
path_to_data = "./docStore/"
def create_chunks(text, tokens_count=400, overlap_count=0):
    """Split a text into chunks of roughly `tokens_count` tokens each."""
    # Chunk size and overlap are measured in tokens, not characters.
    text_splitter = TokenTextSplitter(chunk_size=tokens_count, chunk_overlap=overlap_count)
    texts = text_splitter.split_text(text)
    return texts
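# A minimal usage sketch for create_chunks (the file name is a hypothetical
# example; the snippet is kept as comments so nothing runs on import):
#
#   long_text = open(path_to_data + "report.txt").read()
#   chunks = create_chunks(long_text, tokens_count=400, overlap_count=50)
#   print(f"{len(chunks)} chunks; first chunk starts with: {chunks[0][:80]}")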
def create_documents(df, text_column):
    """
    Convert each row of a dataframe into a langchain Document, using
    `text_column` as the page content and every other column as metadata.
    """
    placeholder = []
    for i in range(len(df)):
        # Every column except the text column becomes (stringified) metadata.
        metadata = {col: str(df.loc[i, col]) for col in list(set(df.columns) - {text_column})}
        placeholder.append(Document(page_content=df.loc[i, text_column],
                                    metadata=metadata))
    return placeholder
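# A minimal usage sketch for create_documents (the dataframe below is an
# assumed example, kept as comments so nothing runs on import):
#
#   df = pd.DataFrame({"text": ["chunk one", "chunk two"],
#                      "project": ["P1", "P2"]})
#   docs = create_documents(df, text_column="text")
#   # docs[0].page_content == "chunk one"; docs[0].metadata == {"project": "P1"}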
@st.cache_resource
def get_client():
    # Reuse a client already stored in session state by the app, if any;
    # otherwise open the local on-disk Qdrant store. The cache_resource
    # decorator ensures the client is created only once per process.
    if 'qdrant' in st.session_state:
        return st.session_state.qdrant
    else:
        client = QdrantClient(path=path_to_qdrant)
        return client
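# A minimal usage sketch for get_client (the collection name and embedding are
# assumptions, kept as comments so nothing runs on import):
#
#   client = get_client()
#   print(client.get_collections())                    # list existing collections
#   # hits = client.search(collection_name="docs",     # hypothetical collection
#   #                      query_vector=embedding, limit=10)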