abhinand2 committed on
Commit
bfe9335
·
verified ·
1 Parent(s): 5feb488

Rename and update db.py

Browse files
Files changed (1) hide show
  1. setup_db.py → db.py +13 -8
setup_db.py → db.py RENAMED
@@ -6,7 +6,6 @@ from langchain_chroma import Chroma
6
 
7
 
8
  file_path="./paul_graham_essays.csv"
9
- model_path = 'intfloat/multilingual-e5-large-instruct'
10
  db_persist_directory = './docs/chroma/'
11
 
12
 
@@ -26,14 +25,14 @@ def load_data():
26
  return data[1:]
27
 
28
 
29
- def split_docs(docs):
30
  splitter = RecursiveCharacterTextSplitter(
31
- chunk_size=1000,
32
- chunk_overlap=200,
33
  separators=['\n\n', '\n', '(?<=\. )', ' ', '']
34
  )
35
 
36
- return splitter.split_documents(docs)
37
 
38
 
39
  def generate_embeddings():
@@ -49,8 +48,14 @@ def generate_embeddings():
49
  )
50
 
51
 
52
- def get_db():
53
- splits = split_docs(load_data())
 
 
 
 
 
 
54
 
55
  embedding = generate_embeddings()
56
 
@@ -58,4 +63,4 @@ def get_db():
58
  documents=splits,
59
  embedding=embedding,
60
  persist_directory=persist_directory,
61
- )
 
6
 
7
 
8
  file_path="./paul_graham_essays.csv"
 
9
  db_persist_directory = './docs/chroma/'
10
 
11
 
 
25
  return data[1:]
26
 
27
 
28
+ def split_data(data, chunk_size, chunk_overlap):
29
  splitter = RecursiveCharacterTextSplitter(
30
+ chunk_size=chunk_size,
31
+ chunk_overlap=chunk_overlap,
32
  separators=['\n\n', '\n', '(?<=\. )', ' ', '']
33
  )
34
 
35
+ return splitter.split_documents(data)
36
 
37
 
38
  def generate_embeddings():
 
48
  )
49
 
50
 
51
+ def get_db(
52
+ chunk_size=1000,
53
+ chunk_overlap=200,
54
+ model_path = 'intfloat/multilingual-e5-large-instruct',
55
+ ):
56
+ data = load_data()
57
+
58
+ splits = split_data(data, chunk_size, chunk_overlap)
59
 
60
  embedding = generate_embeddings()
61
 
 
63
  documents=splits,
64
  embedding=embedding,
65
  persist_directory=persist_directory,
66
+ )