Spaces:
Runtime error
Runtime error
Update prepare_vector_dp.py
Browse files- prepare_vector_dp.py +3 -3
prepare_vector_dp.py
CHANGED
@@ -36,7 +36,7 @@ def create_db_from_text(request_model):
|
|
36 |
|
37 |
text_splitter = CharacterTextSplitter(
|
38 |
separator="\n",
|
39 |
-
chunk_size=
|
40 |
chunk_overlap=50,
|
41 |
length_function=len
|
42 |
)
|
@@ -77,8 +77,8 @@ def create_dp_from_files(request_model):
|
|
77 |
print(f"Loaded {len(documents)} documents")
|
78 |
for doc in documents:
|
79 |
print("Preview:", doc.page_content[:300])
|
80 |
-
|
81 |
-
text_splitter = CharacterTextSplitter(chunk_size =
|
82 |
chunks = text_splitter.split_documents(documents)
|
83 |
|
84 |
embedding_model = HuggingFaceEmbeddings(model_name = request_model)
|
|
|
36 |
|
37 |
text_splitter = CharacterTextSplitter(
|
38 |
separator="\n",
|
39 |
+
chunk_size=400,
|
40 |
chunk_overlap=50,
|
41 |
length_function=len
|
42 |
)
|
|
|
77 |
print(f"Loaded {len(documents)} documents")
|
78 |
for doc in documents:
|
79 |
print("Preview:", doc.page_content[:300])
|
80 |
+
# chunk_size is measured in characters, not tokens. As a rule of thumb, 1 token is roughly 4 characters of English text, but Vietnamese text tends to fit fewer characters per token, so chunk_size is reduced from 512 to 400.
|
81 |
+
text_splitter = CharacterTextSplitter(chunk_size = 400, chunk_overlap = 50)
|
82 |
chunks = text_splitter.split_documents(documents)
|
83 |
|
84 |
embedding_model = HuggingFaceEmbeddings(model_name = request_model)
|