Pudding48 committed on
Commit
d950c3e
·
verified ·
1 Parent(s): eb6eee8

Update prepare_vector_dp.py

Browse files
Files changed (1) hide show
  1. prepare_vector_dp.py +3 -3
prepare_vector_dp.py CHANGED
@@ -36,7 +36,7 @@ def create_db_from_text(request_model):
36
 
37
  text_splitter = CharacterTextSplitter(
38
  separator="\n",
39
- chunk_size=512,
40
  chunk_overlap=50,
41
  length_function=len
42
  )
@@ -77,8 +77,8 @@ def create_dp_from_files(request_model):
77
  print(f"Loaded {len(documents)} documents")
78
  for doc in documents:
79
  print("Preview:", doc.page_content[:300])
80
-
81
- text_splitter = CharacterTextSplitter(chunk_size = 512, chunk_overlap = 50)
82
  chunks = text_splitter.split_documents(documents)
83
 
84
  embedding_model = HuggingFaceEmbeddings(model_name = request_model)
 
36
 
37
  text_splitter = CharacterTextSplitter(
38
  separator="\n",
39
+ chunk_size=400,
40
  chunk_overlap=50,
41
  length_function=len
42
  )
 
77
  print(f"Loaded {len(documents)} documents")
78
  for doc in documents:
79
  print("Preview:", doc.page_content[:300])
80
+ # chunk_size is measured in characters. One token is roughly 4 characters in English, but Vietnamese text yields fewer characters per token, so chunk_size is reduced from 512 to 400.
81
+ text_splitter = CharacterTextSplitter(chunk_size = 400, chunk_overlap = 50)
82
  chunks = text_splitter.split_documents(documents)
83
 
84
  embedding_model = HuggingFaceEmbeddings(model_name = request_model)