Spaces:
Runtime error
Runtime error
Create setup_db.py
Browse files- setup_db.py +61 -0
setup_db.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from langchain_community.document_loaders.csv_loader import CSVLoader
|
2 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
3 |
+
import torch
|
4 |
+
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
+
from langchain_chroma import Chroma
|
6 |
+
|
7 |
+
|
8 |
+
# Path to the source CSV that load_data() reads.
file_path = "./paul_graham_essays.csv"

# Model identifier handed to HuggingFaceEmbeddings in generate_embeddings().
model_path = "intfloat/multilingual-e5-large-instruct"

# Directory name reserved for the Chroma store (NOTE(review): presumably the
# on-disk persistence location -- confirm against get_db()).
db_persist_directory = "./docs/chroma/"
|
11 |
+
|
12 |
+
|
13 |
+
def load_data():
    """Load the essays CSV and return its rows as documents.

    The CSV at ``file_path`` is read with explicit column names
    (``id``, ``title``, ``date``, ``text``). ``text`` is used as the document
    content, ``date`` is kept as metadata and ``title`` is used as the source.

    Returns:
        The loaded documents with the first one removed.
    """
    csv_options = {
        "delimiter": ",",
        "fieldnames": ['id', 'title', 'date', 'text'],
    }
    essay_loader = CSVLoader(
        file_path=file_path,
        csv_args=csv_options,
        source_column='title',
        metadata_columns=['date'],
        content_columns=['text'],
    )

    # Because fieldnames are supplied explicitly, the file's first row is
    # loaded as a regular record (presumably the header line -- confirm
    # against the CSV); drop it.
    return essay_loader.load()[1:]
|
27 |
+
|
28 |
+
|
29 |
+
def split_docs(docs):
    """Split documents into overlapping chunks for embedding.

    Args:
        docs: Documents to split (as returned by ``load_data``).

    Returns:
        The list of chunked documents produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        # Separators are tried in order: paragraph, line, sentence end, word,
        # character. The sentence separator is a regex lookbehind, so it is a
        # raw string (avoids the invalid "\." escape warning) ...
        separators=['\n\n', '\n', r'(?<=\. )', ' ', ''],
        # ... and the splitter must be told to treat separators as regexes;
        # otherwise the lookbehind is escaped and matched literally, i.e.
        # it never matches and sentence-level splitting is silently skipped.
        is_separator_regex=True,
    )

    return splitter.split_documents(docs)
|
37 |
+
|
38 |
+
|
39 |
+
def generate_embeddings():
    """Create the HuggingFace embedding model wrapper for ``model_path``.

    The model is placed on CUDA when ``torch`` reports it as available and on
    the CPU otherwise. Embedding normalization is explicitly turned off.

    Returns:
        A configured ``HuggingFaceEmbeddings`` instance.
    """
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs={'device': target_device},
        encode_kwargs={'normalize_embeddings': False},
    )
|
50 |
+
|
51 |
+
|
52 |
+
def get_db():
    """Build the Chroma vector store from the essays CSV.

    Loads the CSV, splits the documents into chunks, embeds them with the
    model from ``generate_embeddings`` and stores them in a Chroma collection
    that uses ``db_persist_directory`` as its persist directory.

    Returns:
        The ``Chroma`` vector store created by ``Chroma.from_documents``.
    """
    splits = split_docs(load_data())

    embedding = generate_embeddings()

    return Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        # The module constant is named ``db_persist_directory``; the previous
        # bare ``persist_directory`` was undefined and raised NameError.
        persist_directory=db_persist_directory,
    )
|