File size: 1,147 Bytes
e44062d
 
 
1170eaf
 
e44062d
1170eaf
e44062d
 
 
 
 
 
 
 
 
 
 
 
 
 
a90749d
e44062d
 
4b20d19
1170eaf
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from langchain_text_splitters import TokenTextSplitter
from langchain.docstore.document import Document
import pandas as pd
from qdrant_client import QdrantClient
path_to_qdrant = "/data/local_qdrant"
path_to_data = "./docStore/"
import streamlit as st

def create_chunks(text, tokens_count=400, overlap_count=0):
    """Split *text* into token-based chunks.

    Args:
        text: Raw text to split.
        tokens_count: Maximum chunk size, measured in tokens (default 400).
        overlap_count: Number of tokens shared between consecutive chunks
            (default 0, i.e. no overlap).

    Returns:
        list[str]: The chunk strings produced by ``TokenTextSplitter``.
    """
    # Chunk size is expressed in tokens, not characters.
    text_splitter = TokenTextSplitter(
        chunk_size=tokens_count, chunk_overlap=overlap_count
    )
    return text_splitter.split_text(text)

def create_documents(df, text_column):
    """Convert a DataFrame into a list of langchain ``Document`` objects.

    The *text_column* values become each document's ``page_content``;
    every other column is stringified and stored in its ``metadata`` dict.

    Args:
        df: Input ``pandas.DataFrame``, one document per row.
        text_column: Name of the column holding the document text.

    Returns:
        list[Document]: One ``Document`` per row of *df*.
    """
    metadata_cols = [col for col in df.columns if col != text_column]
    docs = []
    # Iterate over rows rather than positional indices: the original
    # `df.loc[i, ...]` pattern raised KeyError for any DataFrame whose
    # index was not a default RangeIndex (e.g. after filtering).
    for _, row in df.iterrows():
        metadata = {col: str(row[col]) for col in metadata_cols}
        docs.append(Document(page_content=row[text_column], metadata=metadata))
    return docs
    
@st.cache_resource
def get_client():
    """Return a ``QdrantClient``, reusing one stored in the Streamlit session.

    A client already stashed under ``st.session_state.qdrant`` is returned
    as-is; otherwise a new local (on-disk) client is opened at
    ``path_to_qdrant``, cached in session state, and returned.

    Returns:
        QdrantClient: The shared client instance.
    """
    if 'qdrant' in st.session_state:
        return st.session_state.qdrant
    client = QdrantClient(path=path_to_qdrant)
    # Bug fix: the new client was previously never written back to
    # session_state, so the lookup above could never hit.
    st.session_state.qdrant = client
    return client