Spaces:
Runtime error
Runtime error
from langchain_community.document_loaders.csv_loader import CSVLoader | |
from langchain_text_splitters import RecursiveCharacterTextSplitter | |
import torch | |
from langchain_huggingface import HuggingFaceEmbeddings | |
from langchain_chroma import Chroma | |
import logging | |
# Module-level logger; handlers and levels are left to the application.
logger = logging.getLogger(__name__)

# CSV file that load_data() reads.
file_path = "./paul_graham_essays.csv"

# Directory handed to Chroma as its persist_directory in get_db().
db_persist_directory = './docs/chroma/'
def load_data():
    """Load the essays CSV through ``CSVLoader`` and return its documents.

    The loader reads the module-level ``file_path`` as a comma-delimited file
    with explicit field names ``id``, ``title``, ``date`` and ``text``. It uses
    ``title`` as the source column, ``date`` as a metadata column and ``text``
    as the content column.

    Returns:
        The list produced by ``loader.load()`` with its first element removed.
    """
    logger.info('Instantiating CSVLoader with file_path=%s', file_path)
    loader = CSVLoader(
        file_path=file_path,
        csv_args={
            "delimiter": ",",
            "fieldnames": ['id', 'title', 'date', 'text'],
        },
        source_column='title',
        metadata_columns=['date'],
        content_columns=['text'],
    )
    logger.info('Instantiating CSVLoader complete')

    logger.info('Loading data')
    data = loader.load()
    logger.info('Loading data complete')

    logger.info('Returning data')
    # Explicit ``fieldnames`` are passed above, so the file's first row is read
    # as an ordinary record. Presumably that row is the CSV header, hence it is
    # dropped here -- confirm against the actual file.
    return data[1:]
def split_data(data, chunk_size, chunk_overlap):
    """Split documents into overlapping chunks.

    Args:
        data: Documents to split; passed to ``split_documents`` unchanged.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(data)``.
    """
    logger.info(
        'Instantiating RecursiveCharacterTextSplitter with chunk_size=%s and chunk_overlap=%s',
        chunk_size,
        chunk_overlap,
    )
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Tried in order: paragraph break, line break, sentence end, space,
        # then single characters. The sentence separator is a zero-width
        # lookbehind, so the ". " stays attached to the preceding sentence.
        separators=['\n\n', '\n', r'(?<=\. )', ' ', ''],
        # Without this flag the separators are escaped and matched literally,
        # so the lookbehind pattern above would never match any text.
        is_separator_regex=True,
    )
    logger.info('Instantiating RecursiveCharacterTextSplitter complete')

    logger.info('Generating and returning splits')
    return splitter.split_documents(data)
def generate_embeddings(model_name):
    """Build a ``HuggingFaceEmbeddings`` instance for ``model_name``.

    The model is placed on ``"cuda"`` when ``torch.cuda.is_available()`` is
    true and on ``"cpu"`` otherwise. Embedding normalisation is turned off via
    ``encode_kwargs``.

    Args:
        model_name: Model identifier forwarded to ``HuggingFaceEmbeddings``.

    Returns:
        The configured ``HuggingFaceEmbeddings`` object.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_kwargs = {'device': device}
    encode_kwargs = {'normalize_embeddings': False}

    logger.info(
        'Instantiating and returning HuggingFaceEmbeddings with model_name=%s, model_kwargs=%s and encode_kwargs=%s',
        model_name,
        model_kwargs,
        encode_kwargs,
    )
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
def get_db(
    chunk_size=1000,
    chunk_overlap=200,
    model_name='intfloat/multilingual-e5-large-instruct',
):
    """Load, split and embed the essays, then return a Chroma vector store.

    Runs ``load_data``, ``split_data`` and ``generate_embeddings`` in that
    order and passes the results to ``Chroma.from_documents`` together with
    the module-level ``db_persist_directory``.

    Args:
        chunk_size: Chunk size forwarded to ``split_data``.
        chunk_overlap: Chunk overlap forwarded to ``split_data``.
        model_name: Embedding model identifier forwarded to
            ``generate_embeddings``.

    Returns:
        The object returned by ``Chroma.from_documents``.
    """
    logger.info('Getting data')
    data = load_data()

    logger.info('Getting splits')
    splits = split_data(data, chunk_size, chunk_overlap)

    logger.info('Getting embedding')
    embedding = generate_embeddings(model_name)

    logger.info(
        'Instantiating and returning Chroma DB with persist_directory=%s',
        db_persist_directory,
    )
    # NOTE(review): every call re-embeds the whole corpus and writes into the
    # same persist directory. Presumably repeated runs add the same chunks
    # again -- confirm, and consider reusing an existing store instead.
    return Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        persist_directory=db_persist_directory,
    )