Spaces:
Restarting
Restarting
Update db.py
Browse files
db.py
CHANGED
@@ -4,12 +4,16 @@ import torch
|
|
4 |
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
from langchain_chroma import Chroma
|
6 |
|
|
|
|
|
|
|
7 |
|
8 |
file_path="./paul_graham_essays.csv"
|
9 |
db_persist_directory = './docs/chroma/'
|
10 |
|
11 |
|
12 |
def load_data():
|
|
|
13 |
loader = CSVLoader(
|
14 |
file_path=file_path,
|
15 |
csv_args={
|
@@ -20,29 +24,38 @@ def load_data():
|
|
20 |
metadata_columns=['date'],
|
21 |
content_columns=['text'],
|
22 |
)
|
|
|
23 |
|
|
|
24 |
data = loader.load()
|
|
|
|
|
|
|
25 |
return data[1:]
|
26 |
|
27 |
|
28 |
def split_data(data, chunk_size, chunk_overlap):
    """Split loaded documents into overlapping chunks.

    Args:
        data: Documents to split; handed unchanged to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(data)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Raw string: '\.' is an invalid escape sequence in a plain literal.
        separators=['\n\n', '\n', r'(?<=\. )', ' ', ''],
        # The lookbehind separator is a regular expression. Without this flag
        # the splitter escapes every separator and looks for the literal text
        # "(?<=\. )", which never occurs, so sentence boundaries are skipped.
        is_separator_regex=True,
    )

    return splitter.split_documents(data)
|
36 |
|
37 |
|
38 |
-
def generate_embeddings(
|
39 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
40 |
model_kwargs = {'device': device}
|
41 |
|
42 |
encode_kwargs = {'normalize_embeddings': False}
|
43 |
|
|
|
44 |
return HuggingFaceEmbeddings(
|
45 |
-
model_name=
|
46 |
model_kwargs=model_kwargs,
|
47 |
encode_kwargs=encode_kwargs,
|
48 |
)
|
@@ -51,14 +64,18 @@ def generate_embeddings(model_path):
|
|
51 |
def get_db(
|
52 |
chunk_size=1000,
|
53 |
chunk_overlap=200,
|
54 |
-
|
55 |
):
|
|
|
56 |
data = load_data()
|
57 |
|
|
|
58 |
splits = split_data(data, chunk_size, chunk_overlap)
|
59 |
|
60 |
-
embedding
|
61 |
-
|
|
|
|
|
62 |
return Chroma.from_documents(
|
63 |
documents=splits,
|
64 |
embedding=embedding,
|
|
|
4 |
from langchain_huggingface import HuggingFaceEmbeddings
|
5 |
from langchain_chroma import Chroma
|
6 |
|
7 |
+
import logging
|
8 |
+
logger = logging.getLogger(__name__)
|
9 |
+
|
10 |
|
11 |
file_path="./paul_graham_essays.csv"
|
12 |
db_persist_directory = './docs/chroma/'
|
13 |
|
14 |
|
15 |
def load_data():
|
16 |
+
logger.info(f'Instantiating CSVLoader with file_path={file_path}')
|
17 |
loader = CSVLoader(
|
18 |
file_path=file_path,
|
19 |
csv_args={
|
|
|
24 |
metadata_columns=['date'],
|
25 |
content_columns=['text'],
|
26 |
)
|
27 |
+
logger.info('Instantiating CSVLoader complete')
|
28 |
|
29 |
+
logger.info('Loading data')
|
30 |
data = loader.load()
|
31 |
+
logger.info('Loading data complete')
|
32 |
+
|
33 |
+
logger.info('Returning data')
|
34 |
return data[1:]
|
35 |
|
36 |
|
37 |
def split_data(data, chunk_size, chunk_overlap):
    """Split loaded documents into overlapping chunks.

    Args:
        data: Documents to split; handed unchanged to ``split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(data)``.
    """
    # Lazy %-style arguments: the message is only formatted if INFO is enabled.
    logger.info(
        'Instantiating RecursiveCharacterTextSplitter with chunk_size=%s and chunk_overlap=%s',
        chunk_size,
        chunk_overlap,
    )
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Raw string: '\.' is an invalid escape sequence in a plain literal.
        separators=['\n\n', '\n', r'(?<=\. )', ' ', ''],
        # The lookbehind separator is a regular expression. Without this flag
        # the splitter escapes every separator and looks for the literal text
        # "(?<=\. )", which never occurs, so sentence boundaries are skipped.
        is_separator_regex=True,
    )
    logger.info('Instantiating RecursiveCharacterTextSplitter complete')

    logger.info('Generating and returning splits')
    return splitter.split_documents(data)
|
48 |
|
49 |
|
50 |
+
def generate_embeddings(model_name):
    """Create the HuggingFaceEmbeddings object for ``model_name``.

    The model device is "cuda" when ``torch.cuda.is_available()`` is true and
    "cpu" otherwise. Embedding normalization is switched off via
    ``encode_kwargs``.

    Args:
        model_name: Passed through as ``model_name`` to HuggingFaceEmbeddings.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    model_kwargs = dict(device=device)
    encode_kwargs = dict(normalize_embeddings=False)

    logger.info(f'Instantiating and returning HuggingFaceEmbeddings with model_name={model_name}, model_kwargs={model_kwargs} and encode_kwargs={encode_kwargs}')
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )
    return embeddings
|
|
|
64 |
def get_db(
|
65 |
chunk_size=1000,
|
66 |
chunk_overlap=200,
|
67 |
+
model_name = 'intfloat/multilingual-e5-large-instruct',
|
68 |
):
|
69 |
+
logger.info('Getting data')
|
70 |
data = load_data()
|
71 |
|
72 |
+
logger.info('Getting splits')
|
73 |
splits = split_data(data, chunk_size, chunk_overlap)
|
74 |
|
75 |
+
logger.info('Getting embedding')
|
76 |
+
embedding = generate_embeddings(model_name)
|
77 |
+
|
78 |
+
logger.info(f'Instantiating and returning Chroma DB with persist_directory={db_persist_directory}')
|
79 |
return Chroma.from_documents(
|
80 |
documents=splits,
|
81 |
embedding=embedding,
|