TinyLLamaTest

Runtime error

Pudding48 committed on Jul 13

Commit

d950c3e

verified ·

1 Parent(s): eb6eee8

Update prepare_vector_dp.py

Files changed (1) hide show

prepare_vector_dp.py CHANGED Viewed

@@ -36,7 +36,7 @@ def create_db_from_text(request_model):
     text_splitter = CharacterTextSplitter(
         separator="\n",
-        chunk_size=512,
         chunk_overlap=50,
         length_function=len
     )
@@ -77,8 +77,8 @@ def create_dp_from_files(request_model):
     print(f"Loaded {len(documents)} documents")
     for doc in documents:
         print("Preview:", doc.page_content[:300])
-    text_splitter = CharacterTextSplitter(chunk_size = 512, chunk_overlap = 50)
     chunks = text_splitter.split_documents(documents)
     embedding_model = HuggingFaceEmbeddings(model_name = request_model)

     text_splitter = CharacterTextSplitter(
         separator="\n",
+        chunk_size=400,
         chunk_overlap=50,
         length_function=len
     )
     print(f"Loaded {len(documents)} documents")
     for doc in documents:
         print("Preview:", doc.page_content[:300])
+    # chunk_size is measured in characters. As a rule of thumb, 1 token is about 4 characters (not 4 words) in English, and Vietnamese text gets fewer characters per token, so chunk_size is reduced from 512 to 400.
+    text_splitter = CharacterTextSplitter(chunk_size = 400, chunk_overlap = 50)
     chunks = text_splitter.split_documents(documents)
     embedding_model = HuggingFaceEmbeddings(model_name = request_model)