"""Utility helpers: token-based text chunking and conversion of a pandas
DataFrame into langchain ``Document`` objects."""
from langchain_text_splitters import TokenTextSplitter
from langchain.docstore.document import Document
import pandas as pd

# Base directory for the document store.
# NOTE(review): not referenced by the functions visible in this file —
# presumably consumed by callers elsewhere; confirm before removing.
path_to_data = "./docStore/"

def create_chunks(text, tokens_count=400, overlap_count=0):
    """Split *text* into chunks measured in tokens.

    Parameters
    ----------
    text : str
        The raw text to split.
    tokens_count : int, optional
        Maximum size of each chunk, in tokens (default 400).
    overlap_count : int, optional
        Number of tokens shared between consecutive chunks (default 0).

    Returns
    -------
    list[str]
        The resulting text chunks.
    """
    # chunk_size is expressed in tokens (not characters) — TokenTextSplitter
    # tokenizes internally before splitting.
    text_splitter = TokenTextSplitter(
        chunk_size=tokens_count, chunk_overlap=overlap_count
    )
    return text_splitter.split_text(text)

def create_documents(df, text_column):
    """Convert each row of *df* into a langchain ``Document``.

    The value in *text_column* becomes the document's ``page_content``;
    every other column is stored in the document's metadata, stringified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; one Document is created per row.
    text_column : str
        Name of the column holding the document text.

    Returns
    -------
    list[Document]
        One Document per row of *df*.
    """
    # Hoisted out of the loop: the metadata columns are the same for every row.
    meta_cols = [col for col in df.columns if col != text_column]
    documents = []
    # iterrows() iterates positionally over rows, so this also works for
    # frames with a non-default index (e.g. after filtering/sampling),
    # where the original range(len(df)) + df.loc[i, ...] pattern would
    # raise KeyError.
    for _, row in df.iterrows():
        metadata = {col: str(row[col]) for col in meta_cols}
        documents.append(
            Document(page_content=row[text_column], metadata=metadata)
        )
    return documents