Spaces:
Runtime error
Runtime error
Update prepare_vector_dp.py
Browse files- prepare_vector_dp.py +3 -3
prepare_vector_dp.py
CHANGED
|
@@ -36,7 +36,7 @@ def create_db_from_text(request_model):
|
|
| 36 |
|
| 37 |
text_splitter = CharacterTextSplitter(
|
| 38 |
separator="\n",
|
| 39 |
-
chunk_size=
|
| 40 |
chunk_overlap=50,
|
| 41 |
length_function=len
|
| 42 |
)
|
|
@@ -77,8 +77,8 @@ def create_dp_from_files(request_model):
|
|
| 77 |
print(f"Loaded {len(documents)} documents")
|
| 78 |
for doc in documents:
|
| 79 |
print("Preview:", doc.page_content[:300])
|
| 80 |
-
|
| 81 |
-
text_splitter = CharacterTextSplitter(chunk_size =
|
| 82 |
chunks = text_splitter.split_documents(documents)
|
| 83 |
|
| 84 |
embedding_model = HuggingFaceEmbeddings(model_name = request_model)
|
|
|
|
| 36 |
|
| 37 |
text_splitter = CharacterTextSplitter(
|
| 38 |
separator="\n",
|
| 39 |
+
chunk_size=400,
|
| 40 |
chunk_overlap=50,
|
| 41 |
length_function=len
|
| 42 |
)
|
|
|
|
| 77 |
print(f"Loaded {len(documents)} documents")
|
| 78 |
for doc in documents:
|
| 79 |
print("Preview:", doc.page_content[:300])
|
| 80 |
+
# chunk_size is measured in characters. One token is roughly 4 characters in English, but fewer in Vietnamese, so chunk_size is reduced from 512 to 400.
|
| 81 |
+
text_splitter = CharacterTextSplitter(chunk_size = 400, chunk_overlap = 50)
|
| 82 |
chunks = text_splitter.split_documents(documents)
|
| 83 |
|
| 84 |
embedding_model = HuggingFaceEmbeddings(model_name = request_model)
|