abhinand2 commited on
Commit
66f2ba6
·
verified ·
1 Parent(s): 094d0e0

Update db.py

Browse files
Files changed (1) hide show
  1. db.py +22 -5
db.py CHANGED
@@ -4,12 +4,16 @@ import torch
4
  from langchain_huggingface import HuggingFaceEmbeddings
5
  from langchain_chroma import Chroma
6
 
 
 
 
7
 
8
  file_path="./paul_graham_essays.csv"
9
  db_persist_directory = './docs/chroma/'
10
 
11
 
12
  def load_data():
 
13
  loader = CSVLoader(
14
  file_path=file_path,
15
  csv_args={
@@ -20,29 +24,38 @@ def load_data():
20
  metadata_columns=['date'],
21
  content_columns=['text'],
22
  )
 
23
 
 
24
  data = loader.load()
 
 
 
25
  return data[1:]
26
 
27
 
28
  def split_data(data, chunk_size, chunk_overlap):
 
29
  splitter = RecursiveCharacterTextSplitter(
30
  chunk_size=chunk_size,
31
  chunk_overlap=chunk_overlap,
32
  separators=['\n\n', '\n', '(?<=\. )', ' ', '']
33
  )
 
34
 
 
35
  return splitter.split_documents(data)
36
 
37
 
38
- def generate_embeddings(model_path):
39
  device = "cuda" if torch.cuda.is_available() else "cpu"
40
  model_kwargs = {'device': device}
41
 
42
  encode_kwargs = {'normalize_embeddings': False}
43
 
 
44
  return HuggingFaceEmbeddings(
45
- model_name=model_path,
46
  model_kwargs=model_kwargs,
47
  encode_kwargs=encode_kwargs,
48
  )
@@ -51,14 +64,18 @@ def generate_embeddings(model_path):
51
  def get_db(
52
  chunk_size=1000,
53
  chunk_overlap=200,
54
- model_path = 'intfloat/multilingual-e5-large-instruct',
55
  ):
 
56
  data = load_data()
57
 
 
58
  splits = split_data(data, chunk_size, chunk_overlap)
59
 
60
- embedding = generate_embeddings(model_path)
61
-
 
 
62
  return Chroma.from_documents(
63
  documents=splits,
64
  embedding=embedding,
 
4
  from langchain_huggingface import HuggingFaceEmbeddings
5
  from langchain_chroma import Chroma
6
 
7
+ import logging
8
+ logger = logging.getLogger(__name__)
9
+
10
 
11
  file_path="./paul_graham_essays.csv"
12
  db_persist_directory = './docs/chroma/'
13
 
14
 
15
  def load_data():
16
+ logger.info(f'Instantiating CSVLoader with file_path={file_path}')
17
  loader = CSVLoader(
18
  file_path=file_path,
19
  csv_args={
 
24
  metadata_columns=['date'],
25
  content_columns=['text'],
26
  )
27
+ logger.info('Instantiating CSVLoader complete')
28
 
29
+ logger.info('Loading data')
30
  data = loader.load()
31
+ logger.info('Loading data complete')
32
+
33
+ logger.info('Returning data')
34
  return data[1:]
35
 
36
 
37
def split_data(data, chunk_size, chunk_overlap):
    """Split loaded documents into overlapping character chunks.

    Args:
        data: Sequence of langchain ``Document`` objects to split.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared between consecutive chunks.

    Returns:
        List of chunked ``Document`` objects produced by the splitter.
    """
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info(
        'Instantiating RecursiveCharacterTextSplitter with chunk_size=%s and chunk_overlap=%s',
        chunk_size,
        chunk_overlap,
    )
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # r'(?<=\. )' must be a raw string: '\.' in a plain literal is an
        # invalid escape (SyntaxWarning on Python 3.12+). The lookbehind
        # splits after sentence-ending periods without consuming them.
        separators=['\n\n', '\n', r'(?<=\. )', ' ', ''],
    )
    logger.info('Instantiating RecursiveCharacterTextSplitter complete')

    logger.info('Generating and returning splits')
    return splitter.split_documents(data)
48
 
49
 
50
def generate_embeddings(model_name):
    """Build a HuggingFaceEmbeddings instance for the given model.

    Args:
        model_name: Hugging Face model id (or local path) of the embedding
            model, e.g. ``'intfloat/multilingual-e5-large-instruct'``.

    Returns:
        A configured ``HuggingFaceEmbeddings`` object, placed on GPU when
        one is visible to torch, otherwise on CPU.
    """
    # Prefer CUDA when available; fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_kwargs = {'device': device}

    # Embeddings are intentionally left un-normalized here.
    encode_kwargs = {'normalize_embeddings': False}

    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info(
        'Instantiating and returning HuggingFaceEmbeddings with model_name=%s, model_kwargs=%s and encode_kwargs=%s',
        model_name,
        model_kwargs,
        encode_kwargs,
    )
    return HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
 
64
  def get_db(
65
  chunk_size=1000,
66
  chunk_overlap=200,
67
+ model_name = 'intfloat/multilingual-e5-large-instruct',
68
  ):
69
+ logger.info('Getting data')
70
  data = load_data()
71
 
72
+ logger.info('Getting splits')
73
  splits = split_data(data, chunk_size, chunk_overlap)
74
 
75
+ logger.info('Getting embedding')
76
+ embedding = generate_embeddings(model_name)
77
+
78
+ logger.info(f'Instantiating and returning Chroma DB with persist_directory={db_persist_directory}')
79
  return Chroma.from_documents(
80
  documents=splits,
81
  embedding=embedding,