abhinand2 committed on
Commit
dc92420
·
verified ·
1 Parent(s): d46082c

Create setup_db.py

Browse files
Files changed (1) hide show
  1. setup_db.py +61 -0
setup_db.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_community.document_loaders.csv_loader import CSVLoader
2
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
3
+ import torch
4
+ from langchain_huggingface import HuggingFaceEmbeddings
5
+ from langchain_chroma import Chroma
6
+
7
+
8
# CSV file of essays that load_data() reads.
file_path = "./paul_graham_essays.csv"
# Model identifier handed to HuggingFaceEmbeddings in generate_embeddings().
model_path = "intfloat/multilingual-e5-large-instruct"
# Intended on-disk location for the Chroma vector store.
db_persist_directory = "./docs/chroma/"
11
+
12
+
13
def load_data():
    """Load the essays CSV at ``file_path`` into documents via ``CSVLoader``.

    The loader is configured with explicit column names
    (``id``, ``title``, ``date``, ``text``); ``title`` is used as the source
    column, ``date`` as a metadata column and ``text`` as the content column.

    Returns:
        Everything the loader produced except its first entry.
    """
    column_names = ['id', 'title', 'date', 'text']
    reader_options = {"delimiter": ",", "fieldnames": column_names}

    documents = CSVLoader(
        file_path=file_path,
        csv_args=reader_options,
        source_column='title',
        metadata_columns=['date'],
        content_columns=['text'],
    ).load()

    # Drop the first loaded entry -- with explicit fieldnames supplied this is
    # presumably the CSV's own header line parsed as data (verify against the file).
    return documents[1:]
27
+
28
+
29
def split_docs(docs):
    """Split documents into overlapping chunks.

    Uses a ``RecursiveCharacterTextSplitter`` with a chunk size of 1000 and an
    overlap of 200.

    Args:
        docs: The documents to split; passed straight to
            ``split_documents``.

    Returns:
        The result of ``splitter.split_documents(docs)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        # Separator candidates, coarsest first: blank line, newline,
        # end of sentence, space, single character.
        # Raw string: '\.' in a normal string is an invalid escape sequence.
        separators=['\n\n', '\n', r'(?<=\. )', ' ', ''],
        # The sentence separator is a regex lookbehind. Without this flag the
        # splitter escapes every separator and matches it literally, so the
        # sentence-boundary split would never trigger.
        is_separator_regex=True,
    )

    return splitter.split_documents(docs)
37
+
38
+
39
def generate_embeddings():
    """Create the ``HuggingFaceEmbeddings`` instance for ``model_path``.

    The model is placed on ``"cuda"`` when ``torch.cuda.is_available()``
    reports a GPU and on ``"cpu"`` otherwise. Embedding normalization is
    switched off via ``encode_kwargs``.

    Returns:
        The configured ``HuggingFaceEmbeddings`` object.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs={'device': target_device},
        encode_kwargs={'normalize_embeddings': False},
    )
50
+
51
+
52
def get_db():
    """Build the Chroma vector store from the essay chunks.

    Loads the CSV with ``load_data()``, chunks it with ``split_docs()``,
    creates the embedding model with ``generate_embeddings()`` and hands
    everything to ``Chroma.from_documents``.

    Returns:
        The object returned by ``Chroma.from_documents``.
    """
    chunks = split_docs(load_data())
    embedding_model = generate_embeddings()

    return Chroma.from_documents(
        documents=chunks,
        embedding=embedding_model,
        # Use the module-level constant; the bare name `persist_directory`
        # is not defined anywhere and raised NameError on every call.
        persist_directory=db_persist_directory,
    )