Spaces:
Sleeping
Sleeping
from langchain_text_splitters import TokenTextSplitter | |
from langchain.docstore.document import Document | |
import pandas as pd | |
from qdrant_client import QdrantClient | |
path_to_qdrant = "/data/local_qdrant" | |
path_to_data = "./docStore/" | |
import streamlit as st | |
def create_chunks(text, tokens_count=400, overlap_count=0):
    """Split a text into token-sized chunks.

    Args:
        text: The text to split.
        tokens_count: Chunk size, measured in tokens (passed to the
            splitter as ``chunk_size``).
        overlap_count: Overlap between consecutive chunks (passed to the
            splitter as ``chunk_overlap``).

    Returns:
        Whatever ``TokenTextSplitter.split_text`` returns for ``text`` --
        presumably a list of chunk strings; confirm against the
        langchain_text_splitters version in use.
    """
    # Sizes are expressed in tokens, not characters.
    return TokenTextSplitter(
        chunk_size=tokens_count,
        chunk_overlap=overlap_count,
    ).split_text(text)
def create_documents(df, text_column):
    """Convert each row of a DataFrame into a langchain ``Document``.

    The value in ``text_column`` becomes the document's ``page_content``;
    every other column is converted with ``str`` and stored in the
    document's ``metadata`` under the column name.

    Rows are read by position rather than by index label, so the function
    works for DataFrames whose index is not the default 0..n-1 range
    (e.g. after filtering, sorting or concatenation).

    Args:
        df: Source DataFrame, one row per document.
        text_column: Name of the column holding the document text.

    Returns:
        A list of ``Document`` objects in row order; empty if ``df`` has
        no rows.

    Raises:
        KeyError: If ``df`` has rows but no column named ``text_column``.
    """
    row_count = len(df)
    if row_count == 0:
        # Nothing to convert; also avoids a column lookup on an empty frame.
        return []

    text_values = df[text_column]
    # Keep the DataFrame's column order so metadata keys are deterministic.
    # Columns are accessed one by one (not as a row Series) so that each
    # value keeps its own column dtype before being stringified.
    metadata_series = [
        (col, df[col]) for col in df.columns if col != text_column
    ]

    documents = []
    for pos in range(row_count):
        metadata = {
            col: str(values.iloc[pos]) for col, values in metadata_series
        }
        documents.append(
            Document(page_content=text_values.iloc[pos], metadata=metadata)
        )
    return documents
def get_client():
    """Return a Qdrant client for the local store.

    If the Streamlit session state already has a ``'qdrant'`` entry, that
    object is returned. Otherwise a new ``QdrantClient`` is opened on
    ``path_to_qdrant`` and returned.

    NOTE(review): the newly created client is not written back to the
    session state here -- confirm whether a caller is expected to do so.
    """
    if 'qdrant' not in st.session_state:
        return QdrantClient(path=path_to_qdrant)
    return st.session_state.qdrant