from langchain_text_splitters import TokenTextSplitter
from langchain.docstore.document import Document
import pandas as pd
from qdrant_client import QdrantClient
import streamlit as st

path_to_qdrant = "/data/local_qdrant"
path_to_data = "./docStore/"


def create_chunks(text, tokens_count=400, overlap_count=0):
    """Split a text into token-based chunks."""
    # chunk size and overlap are measured in tokens
    text_splitter = TokenTextSplitter(chunk_size=tokens_count,
                                      chunk_overlap=overlap_count)
    texts = text_splitter.split_text(text)
    return texts


def create_documents(df, text_column):
    """Convert a dataframe into a list of LangChain Documents.

    The text_column becomes the page content; every other column is
    stringified and stored as metadata. Assumes a default RangeIndex.
    """
    placeholder = []
    for i in range(len(df)):
        metadata = {col: str(df.loc[i, col])
                    for col in set(df.columns) - {text_column}}
        placeholder.append(Document(page_content=df.loc[i, text_column],
                                    metadata=metadata))
    return placeholder


@st.cache_resource
def get_client():
    """Return a Qdrant client, reusing one from session state if present."""
    if 'qdrant' in st.session_state:
        return st.session_state.qdrant
    else:
        client = QdrantClient(path=path_to_qdrant)
        return client
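
# Usage sketch (hypothetical, kept commented out so the module stays
# import-safe): shows how the helpers above compose inside a Streamlit app.
# The CSV filename and the "text" column name are assumptions for
# illustration only; they are not defined by this module.
#
#     df = pd.read_csv(path_to_data + "paragraphs.csv")  # hypothetical file
#     docs = create_documents(df, text_column="text")    # column name assumed
#     chunks = create_chunks(docs[0].page_content)       # token-based chunks
#     client = get_client()                              # cached Qdrant client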