import pandas as pd
import streamlit as st
from langchain_text_splitters import TokenTextSplitter
from langchain.docstore.document import Document
from qdrant_client import QdrantClient

path_to_qdrant = "/data/local_qdrant"
path_to_data = "./docStore/"

def create_chunks(text, tokens_count=400, overlap_count=0):
    """Takes a text and creates chunks."""
    # chunk size is expressed in tokens
    text_splitter = TokenTextSplitter(chunk_size=tokens_count, chunk_overlap=overlap_count)
    texts = text_splitter.split_text(text)
    return texts

def create_documents(df, text_column):
    """
    Takes a DataFrame and creates LangChain Documents,
    using every column except `text_column` as metadata.
    """
    placeholder = []
    for i in range(len(df)):
        metadata = {col: str(df.loc[i, col]) for col in set(df.columns) - {text_column}}
        placeholder.append(Document(page_content=df.loc[i, text_column],
                                    metadata=metadata))
    return placeholder

@st.cache_resource
def get_client():
    """Returns a Qdrant client, reusing the one in the session state if present."""
    if 'qdrant' in st.session_state:
        return st.session_state.qdrant
    else:
        client = QdrantClient(path=path_to_qdrant)
        return client
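
# --- Usage sketch (illustrative, not part of the original file) -----------------
# Shows how the helpers above might be combined when preparing documents for
# indexing. The file name "sample.csv" and the column name "content" are
# assumptions made for this example.
if __name__ == "__main__":
    df = pd.read_csv(path_to_data + "sample.csv")        # hypothetical input file
    docs = create_documents(df, text_column="content")   # one Document per row
    # Split each document's text into ~400-token chunks before embedding/indexing.
    chunks = [c for doc in docs for c in create_chunks(doc.page_content)]
    print(f"Created {len(docs)} documents and {len(chunks)} chunks")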